Files
cruciverba_1/curate_lexicon_alpha.py

612 lines
21 KiB
Python

from __future__ import annotations
import argparse
import json
import re
from copy import deepcopy
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH
# Default output files; both are placed in the same directory as this module.
CURATED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_curated.json")
TO_BE_REVIEW_OUTPUT_PATH = Path(__file__).with_name("to_be_review.json")
# Difficulty levels, in the order curate_entry walks them when picking clues.
DIFFICULTIES = ("easy", "medium", "hard", "expert")
# Literal typo / extraction-artifact fixes that normalize_text applies to
# definition text, iterating the entries in insertion order.
# NOTE(review): "quantit" is a prefix of the two keys listed before it and of
# the correct word "quantità", so whether keys are matched as raw substrings
# or as whole words changes the result — confirm against normalize_text.
TEXT_REPLACEMENTS = {
    " ngrandimento": " ingrandimento",
    "superificie": "superficie",
    "quantitaaa": "quantità",
    "quantitaaaa": "quantità",
    "quantit": "quantità",
    "sanit_militare": "sanità_militare",
    " unaparola ": " una parola ",
    # The two lead-ins below are removed outright (empty replacement).
    "questa parola, ": "",
    "questa parola; ": "",
}
# Regexes used by suspicious_proper_noun_definition. They are searched in the
# accent-folded, lowercased definition text, which is why they are written
# without accents (e.g. "citta").
SUSPICIOUS_PROPER_PATTERNS = (
    r"\bepisodio\b",
    r"\bfilm\b",
    r"\bserie tv\b",
    r"\bfamiglia\b",
    r"\bcomune italiano\b",
    r"\bfrazione del comune\b",
    r"\bcitta metropolitana\b",
    r"\bpersonaggio\b",
    r"\balbum\b",
    r"\bcognome\b",
    r"\bnome proprio\b",
)
# Topic name -> hint strings. topic_alignment_score adds points for every hint
# found in the accent-folded, lowercased definition of an entry that carries
# the topic. Hints are matched as plain substrings, so truncated forms such as
# "sacerdot" match any word that contains them.
DOMAIN_HINTS = {
    "religion": ("monastero", "abbazia", "sacerdot", "prete", "vescovo", "clero", "religios"),
    "transport": ("veicolo", "motore", "aereo", "treno", "nave", "trasport", "rimorch", "reattor"),
    "health": ("malat", "ferit", "ospedal", "medic", "sanitar", "cura", "paziente"),
    "nature": ("animale", "pianta", "mare", "bosco", "albero", "fiore", "montagna", "acque", "salate"),
    "geography": ("comune", "paese", "regione", "provincia", "isola", "citta", "territorio"),
    "sea": ("acque", "salate", "superficie terrestre", "oceano"),
}
# Regexes used by likely_abstract_detour, searched in the accent-folded,
# lowercased definition text (hence "quantita" without the accent).
# NOTE: any text matched by the third pattern is already matched by the second.
ABSTRACT_PATTERNS = (
    r"\bgrande quantita\b",
    r"\bfigurato\b",
    r"\bsenso figurato\b",
)
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    Returns:
        Namespace with ``input``, ``output`` and ``review_output`` (values
        given on the command line are converted with ``Path``) and
        ``max_review`` (converted with ``int``; ``0`` by default).
    """
    cli = argparse.ArgumentParser(
        description="Cura il lessico refined per la milestone alpha e separa i casi dubbi in to_be_review.json."
    )
    # (flag, converter, default, help text) in registration order.
    option_table = (
        ("--input", Path, REFINED_LEXICON_OUTPUT_PATH, "Lessico refined di partenza."),
        ("--output", Path, CURATED_LEXICON_OUTPUT_PATH, "Lessico curated da generare."),
        (
            "--review-output",
            Path,
            TO_BE_REVIEW_OUTPUT_PATH,
            "File JSON con le voci che richiedono revisione umana.",
        ),
        (
            "--max-review",
            int,
            0,
            "Limite opzionale di voci da esportare in to_be_review.json. 0 = tutte.",
        ),
    )
    for flag, converter, fallback, help_text in option_table:
        cli.add_argument(flag, type=converter, default=fallback, help=help_text)
    return cli.parse_args()
def load_json(path: Path) -> Dict[str, object]:
    """Read *path* as UTF-8 text and return the decoded JSON value.

    The value is returned as decoded; no check is made here that it is
    actually a JSON object.
    """
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Serialise *payload* as 2-space-indented JSON and write it to *path*.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and the
    file is encoded as UTF-8.
    """
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
def dedupe(items: Iterable[str]) -> List[str]:
    """Return the distinct non-blank items of *items*, in first-seen order.

    Each item is converted with ``str`` and stripped. Items that differ only
    by letter case count as duplicates; the first spelling encountered wins.
    """
    first_spelling: Dict[str, str] = {}
    for raw in items:
        stripped = str(raw).strip()
        if stripped:
            # setdefault keeps the earliest spelling for a given lowercase key.
            first_spelling.setdefault(stripped.lower(), stripped)
    return list(first_spelling.values())
# Translation table for ascii_fold, built once at import time. It covers the
# grave/acute accented vowels in BOTH cases: callers fold first and lowercase
# afterwards, so an unfolded "À" would otherwise come out as "à".
_ASCII_FOLD_TABLE = str.maketrans(
    "àáèéìíòóùúÀÁÈÉÌÍÒÓÙÚ",
    "aaeeiioouuAAEEIIOOUU",
)


def ascii_fold(text: str) -> str:
    """Replace grave/acute accented vowels in *text* with their plain letter.

    Letter case is preserved (``"À"`` becomes ``"A"``); every other character
    is left untouched.

    Args:
        text: Value to fold; it is converted with ``str`` first.

    Returns:
        The folded string.
    """
    return str(text).translate(_ASCII_FOLD_TABLE)
def normalize_text(text: str) -> str:
    """Clean up a definition and return it as a capitalised sentence.

    Steps: apply ``TEXT_REPLACEMENTS``, collapse whitespace, normalise the
    spacing around ``;`` and ``,``, strip leading/trailing `` .;:-``
    characters, uppercase a lowercase first letter and append a final ``.``.

    Replacement keys are applied in dictionary order. A key edge that is a
    word character must sit on a word boundary, so a fragment such as
    ``"quantit"`` is repaired when it stands alone but is left untouched
    inside ``"quantità"`` or ``"quantitativo"``.

    Args:
        text: Raw definition; ``None`` and other falsy values are treated as
            empty.

    Returns:
        The normalised sentence, or ``""`` when nothing but whitespace and
        punctuation is left.
    """
    value = str(text or "").strip()
    if not value:
        return ""
    for old, new in TEXT_REPLACEMENTS.items():
        if not old:
            # An empty key would match between every pair of characters.
            continue
        pattern = re.escape(old)
        if old[0].isalnum() or old[0] == "_":
            pattern = r"(?<!\w)" + pattern
        if old[-1].isalnum() or old[-1] == "_":
            pattern += r"(?!\w)"
        # A callable replacement keeps backslashes in `new` literal.
        value = re.sub(pattern, lambda _match, _new=new: _new, value)
    value = re.sub(r"\s+", " ", value)
    value = re.sub(r"\s*;\s*", "; ", value)
    value = re.sub(r"\s*,\s*", ", ", value)
    value = value.strip(" .;:-")
    if not value:
        # Only punctuation was left: do not fabricate a lone "." sentence.
        return ""
    if value[0].islower():
        value = value[0].upper() + value[1:]
    return value + "."
def split_definition_text(text: str) -> List[str]:
    """Split a raw definition into normalised pieces.

    The text is cut at a semicolon followed by whitespace, and at a period
    followed by whitespace and a letter (the look-ahead is case-insensitive
    because of ``re.IGNORECASE``). Each fragment goes through
    ``normalize_text``; fragments that normalise to an empty string are
    dropped.

    Returns:
        The normalised pieces in their original order; ``[]`` for empty or
        falsy input.
    """
    raw = str(text or "").strip()
    if not raw:
        return []
    fragments = re.split(r"\s*;\s+|\.\s+(?=[a-zàèéìòù])", raw, flags=re.IGNORECASE)
    return [cleaned for cleaned in map(normalize_text, fragments) if cleaned]
def entry_is_common_word(entry: Dict[str, object]) -> bool:
    """Tell whether *entry* looks like a common (non-proper) word.

    True when the ``form`` is non-empty, starts with a lowercase character and
    the entry has no (truthy) ``name_tags``.
    """
    form = str(entry.get("form", ""))
    if not form or not form[0].islower():
        return False
    return not entry.get("name_tags")
def definition_mentions_answer(text: str, answer: str) -> bool:
    """Tell whether *text* contains *answer*, ignoring case and accents.

    The check is a plain substring test, so the answer is also found inside
    longer words. Both values are lowercased BEFORE accent folding so that
    uppercase accented letters are folded as well.

    Args:
        text: Definition to inspect.
        answer: The entry's form.

    Returns:
        ``True`` when the folded answer occurs in the folded text. An empty or
        whitespace-only answer never counts as mentioned (an empty needle
        would otherwise match every text).
    """
    needle = ascii_fold(str(answer).lower())
    if not needle.strip():
        return False
    haystack = ascii_fold(str(text).lower())
    return needle in haystack
def suspicious_proper_noun_definition(text: str, entry: Dict[str, object]) -> bool:
    """Tell whether a common-word entry has a definition that reads like a proper noun's.

    Entries that ``entry_is_common_word`` rejects are never flagged. For the
    others, *text* is accent-folded and lowercased, then searched for any of
    ``SUSPICIOUS_PROPER_PATTERNS``.
    """
    if not entry_is_common_word(entry):
        return False
    folded = ascii_fold(text).lower()
    for pattern in SUSPICIOUS_PROPER_PATTERNS:
        if re.search(pattern, folded):
            return True
    return False
def likely_abstract_detour(text: str) -> bool:
    """Tell whether *text* matches any of ``ABSTRACT_PATTERNS``.

    The text is accent-folded and lowercased before the patterns are searched.
    """
    folded = ascii_fold(text).lower()
    for pattern in ABSTRACT_PATTERNS:
        if re.search(pattern, folded):
            return True
    return False
def semantic_topics(entry: Dict[str, object]) -> List[str]:
    """Collect the lowercased topic names attached to an entry's enrichment data.

    Reads ``entry["semantic"]["semantic_topics"]`` followed by
    ``entry["wiktextract"]["topic_hints"]``; a container that is not a dict is
    skipped. The combined list is passed through ``dedupe``.
    """
    gathered: List[str] = []
    for container_key, list_key in (
        ("semantic", "semantic_topics"),
        ("wiktextract", "topic_hints"),
    ):
        container = entry.get(container_key, {})
        if not isinstance(container, dict):
            continue
        for item in container.get(list_key, []) or []:
            gathered.append(str(item).lower())
    return dedupe(gathered)
def lexical_topics(entry: Dict[str, object]) -> List[str]:
    """Return the entry's own ``topics`` as lowercase strings.

    Falsy items (``None``, ``""``) are skipped; a missing or null ``topics``
    yields ``[]``. Duplicates are kept.
    """
    lowered: List[str] = []
    for topic in entry.get("topics", []) or []:
        if topic:
            lowered.append(str(topic).lower())
    return lowered
def topic_alignment_score(text: str, entry: Dict[str, object]) -> int:
    """Score how well *text* matches the topics attached to *entry*.

    For every distinct topic of the entry (lexical and semantic), each hint
    listed for that topic in ``DOMAIN_HINTS`` that occurs as a substring of the
    accent-folded, lowercased text is worth 16 points.
    """
    haystack = ascii_fold(text).lower()
    entry_topics = set(lexical_topics(entry)) | set(semantic_topics(entry))
    matches = sum(
        1
        for topic in entry_topics
        for hint in DOMAIN_HINTS.get(topic, ())
        if hint in haystack
    )
    return 16 * matches
def candidate_style(text: str) -> str:
    """Classify a clue text as ``"direct"``, ``"balanced"`` or ``"oblique"``.

    A text containing ``;`` or longer than 90 characters is ``"direct"``.
    Otherwise it is ``"balanced"`` when its accent-folded, lowercased form
    contains one of the marker substrings below, and ``"oblique"`` if not.
    """
    if ";" in text or len(text) > 90:
        return "direct"
    balanced_markers = ("chi ", "che ", "strumento", "veicolo", "titolo", "parte di")
    folded = ascii_fold(text).lower()
    for marker in balanced_markers:
        if marker in folded:
            return "balanced"
    return "oblique"
def length_window(difficulty: str) -> Tuple[int, int]:
    """Return the ``(min, max)`` clue length rewarded at *difficulty*.

    Any value other than ``"easy"``, ``"medium"`` or ``"hard"`` gets the
    tightest window, ``(12, 55)``.
    """
    windows = {
        "easy": (18, 90),
        "medium": (18, 78),
        "hard": (14, 62),
    }
    return windows.get(difficulty, (12, 55))
def build_candidate(
    text: str,
    *,
    source: str,
    family: str,
    confidence: float,
    priority: int = 0,
) -> Dict[str, object]:
    """Assemble a candidate-definition record.

    The text is normalised with ``normalize_text`` and the ``style`` field is
    derived from the normalised text via ``candidate_style``; the other
    fields are stored as given.

    Returns:
        Dict with keys ``text``, ``source``, ``family``, ``confidence``,
        ``style`` and ``priority`` (in that order).
    """
    normalized = normalize_text(text)
    record: Dict[str, object] = {
        "text": normalized,
        "source": source,
        "family": family,
        "confidence": confidence,
    }
    record["style"] = candidate_style(normalized)
    record["priority"] = priority
    return record
def _raw_definition_text(value: object) -> str:
    """Return *value* as text, mapping ``None`` to ``""`` rather than ``"None"``."""
    return "" if value is None else str(value)


def _sense_confidence(sense: Dict[str, object]) -> float:
    """Return the sense's confidence, or 0.7 when it is missing, falsy or not numeric."""
    try:
        return float(sense.get("confidence", 0.7) or 0.7)
    except (TypeError, ValueError):
        return 0.7


def _append_pieces(
    candidates: List[Dict[str, object]],
    seen: set,
    raw: object,
    *,
    source: str,
    family: str,
    confidence: float,
    priority: int,
) -> None:
    """Split *raw* into definition pieces and append those not collected yet."""
    for piece in split_definition_text(_raw_definition_text(raw)):
        candidate = build_candidate(
            piece,
            source=source,
            family=family,
            confidence=confidence,
            priority=priority,
        )
        text = str(candidate["text"])
        # The same text may appear once per family, but not twice within one.
        key = (text.lower(), candidate["family"])
        if text and key not in seen:
            seen.add(key)
            candidates.append(candidate)


def collect_candidates(entry: Dict[str, object]) -> List[Dict[str, object]]:
    """Gather every candidate definition available for *entry*.

    Sources are visited in a fixed order: semantic synset definitions,
    semantic glosses, refined senses, BabelNet best-synset glosses and
    wiktextract definitions. Within a source, later items get a lower
    priority (never below 0). Null (``None``) definitions, glosses and sense
    sources are treated as absent instead of being turned into the literal
    text ``"None"``.

    Args:
        entry: Lexicon entry; sub-structures of an unexpected type are skipped.

    Returns:
        Candidate records as produced by ``build_candidate``, de-duplicated
        per ``(lowercased text, family)``.
    """
    candidates: List[Dict[str, object]] = []
    seen: set = set()

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        for index, synset in enumerate(semantic.get("synsets", []) or []):
            if not isinstance(synset, dict):
                continue
            _append_pieces(
                candidates,
                seen,
                synset.get("definition"),
                source="semantic",
                family="semantic_definition",
                confidence=0.9,
                priority=max(0, 100 - index * 12),
            )
        for index, gloss in enumerate(semantic.get("glosses", []) or []):
            _append_pieces(
                candidates,
                seen,
                gloss,
                source="semantic_gloss",
                family="semantic_gloss",
                confidence=0.8,
                priority=max(0, 90 - index * 10),
            )

    for index, sense in enumerate(entry.get("senses", []) or []):
        if not isinstance(sense, dict):
            continue
        raw_source = sense.get("source")
        source = "refined" if raw_source is None else str(raw_source)
        _append_pieces(
            candidates,
            seen,
            sense.get("definition"),
            # Senses that merely echo the semantic layer are relabelled.
            source="refined" if source == "semantic" else source,
            family="refined_sense",
            confidence=_sense_confidence(sense),
            priority=max(0, 80 - index * 8),
        )

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        best_synset = babelnet.get("best_synset", {})
        if isinstance(best_synset, dict):
            confidence = 0.85 if babelnet.get("status") == "enriched" else 0.55
            for index, gloss in enumerate(best_synset.get("glosses", []) or []):
                _append_pieces(
                    candidates,
                    seen,
                    gloss,
                    source="babelnet",
                    family="babelnet_gloss",
                    confidence=confidence,
                    priority=max(0, 60 - index * 8),
                )

    wiktextract = entry.get("wiktextract", {})
    if isinstance(wiktextract, dict):
        confidence = 0.78 if wiktextract.get("matched") else 0.45
        for index, definition in enumerate(wiktextract.get("definitions", []) or []):
            _append_pieces(
                candidates,
                seen,
                definition,
                source="wiktextract",
                family="wiktextract_definition",
                confidence=confidence,
                priority=max(0, 88 - index * 9),
            )

    return candidates
def score_candidate(candidate: Dict[str, object], entry: Dict[str, object], difficulty: str) -> int:
    """Rate how suitable *candidate* is as the clue for *entry* at *difficulty*.

    Higher is better. A candidate whose text is shorter than 12 characters
    gets the sentinel ``-10_000``; any other candidate receives the sum of the
    adjustments applied below.
    """
    clue = str(candidate["text"])
    answer = str(entry.get("form", "")).lower()
    origin = str(candidate.get("source"))
    kind = str(candidate.get("family"))
    trust = float(candidate.get("confidence", 0.0) or 0.0)
    clue_len = len(clue)
    if clue_len < 12:
        return -10_000

    # A clue that contains the answer is heavily penalised; a clean one is rewarded.
    total = -140 if definition_mentions_answer(clue, answer) else 30
    if suspicious_proper_noun_definition(clue, entry):
        total -= 220
    if likely_abstract_detour(clue):
        total -= 80

    # Length: flat bonus inside the window, 1 point per excess character when
    # too long, half a point per missing character when too short.
    shortest, longest = length_window(difficulty)
    if clue_len > longest:
        total -= clue_len - longest
    elif clue_len < shortest:
        total -= (shortest - clue_len) // 2
    else:
        total += 24

    origin_bonus = {
        "semantic": 55,
        "semantic_gloss": 40,
        "babelnet": 24,
        "refined": 30,
        "wiktextract": 52,
    }
    kind_bonus = {
        "semantic_definition": 30,
        "semantic_gloss": 18,
        "babelnet_gloss": 8,
        "refined_sense": 22,
        "wiktextract_definition": 28,
    }
    total += origin_bonus.get(origin, 10)
    total += kind_bonus.get(kind, 0)
    total += int(candidate.get("priority", 0) or 0)
    total += int(trust * 35)

    alignment = topic_alignment_score(clue, entry)
    total += alignment
    entry_topics = set(lexical_topics(entry)) | set(semantic_topics(entry))
    hinted_topics = entry_topics & {"religion", "transport", "health", "nature", "geography", "sea"}
    # The entry has a topic with hint words, yet the clue uses none of them.
    if hinted_topics and alignment == 0:
        total -= 45

    # Each difficulty rewards its own clue style(s).
    style = str(candidate.get("style"))
    style_rewards = {
        "easy": (("direct",), 16),
        "medium": (("direct", "balanced"), 14),
        "hard": (("balanced",), 10),
        "expert": (("oblique",), 10),
    }
    favoured_styles, reward = style_rewards.get(difficulty, ((), 0))
    if style in favoured_styles:
        total += reward

    if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", clue.lower()):
        total -= 30
    if difficulty in {"hard", "expert"} and ";" in clue:
        total -= 10
    if entry.get("needs_review"):
        total -= 8
    return total
def choose_best_candidate(
    candidates: Sequence[Dict[str, object]],
    entry: Dict[str, object],
    difficulty: str,
) -> Optional[Dict[str, object]]:
    """Pick the highest-ranked candidate for *difficulty*.

    Candidates are ranked by score, then confidence, then priority, then
    shorter text; among complete ties the earliest candidate wins. No minimum
    score is enforced, so a candidate scored ``-10_000`` is still returned
    when nothing ranks higher.

    Returns:
        The winning candidate, or ``None`` when *candidates* is empty.
    """
    if not candidates:
        return None

    def ranking(candidate: Dict[str, object]) -> Tuple[int, float, float, int]:
        return (
            score_candidate(candidate, entry, difficulty),
            float(candidate.get("confidence", 0.0)),
            float(candidate.get("priority", 0.0)),
            -len(str(candidate.get("text", ""))),
        )

    # max() returns the first of several equal maxima, matching a stable
    # descending sort followed by taking the head.
    return max(candidates, key=ranking)
def review_reasons(entry: Dict[str, object], candidates: Sequence[Dict[str, object]]) -> List[str]:
    """List the reasons why *entry* should be looked at by a human.

    Args:
        entry: Curated entry; ``preferred_definition`` and
            ``preferred_source`` are read from it.
        candidates: The candidate definitions collected for the entry.

    Returns:
        Reason codes in a fixed order, without duplicates; empty when nothing
        was flagged. A null ``senses`` list or a ``babelnet`` / ``wiktextract``
        value that is not a dict is treated as absent.
    """
    reasons: List[str] = []
    form = str(entry.get("form", ""))
    lowered_topics = set(lexical_topics(entry))
    semantic_topic_set = set(semantic_topics(entry))
    babelnet = entry.get("babelnet")
    babelnet_status = str(babelnet.get("status", "")) if isinstance(babelnet, dict) else ""
    wiktextract = entry.get("wiktextract", {})
    wiktextract_status = str(wiktextract.get("status", "")) if isinstance(wiktextract, dict) else ""
    preferred_definition = str(entry.get("preferred_definition", ""))
    preferred_source = str(entry.get("preferred_source", ""))
    # `or []` also covers an explicit null, as collect_candidates does.
    senses = entry.get("senses") or []

    if not candidates:
        reasons.append("no_viable_definition")
    if not preferred_definition and entry.get("needs_review"):
        reasons.append("flagged_by_refined_stage")
    if preferred_definition and suspicious_proper_noun_definition(preferred_definition, entry):
        reasons.append("proper_noun_collision")
    if babelnet_status == "ambiguous" and preferred_source == "babelnet":
        reasons.append("babelnet_ambiguous")
    if wiktextract_status in {"missing", "no_match"} and not preferred_definition:
        reasons.append("wiktextract_missing")
    if lowered_topics == {"general"} and not semantic_topic_set and not preferred_definition:
        reasons.append("only_general_topics")
    if len(form) <= 2:
        reasons.append("very_short_word")
    if str(entry.get("pos", "")) in {"PREP", "CONJ"}:
        reasons.append("function_word")
    # A sense whose topics stringify with "None" (missing topics or a null item).
    if preferred_source == "babelnet" and any(
        "None" in str(sense.get("topics")) for sense in senses if isinstance(sense, dict)
    ):
        reasons.append("unresolved_sense_topics")
    if preferred_definition and definition_mentions_answer(preferred_definition, form.lower()):
        reasons.append("candidate_mentions_answer")
    return dedupe(reasons)
def curate_entry(entry: Dict[str, object]) -> Tuple[Dict[str, object], Optional[Dict[str, object]]]:
    """Curate one lexicon entry.

    Works on a deep copy of *entry*: collects the candidate definitions, picks
    the best one per difficulty, stores the curation fields on the copy and
    decides whether the entry is ``alpha_ready``.

    Returns:
        ``(curated, review_item)``. ``review_item`` is ``None`` when no review
        reason was raised; otherwise it is a summary of the entry that includes
        at most the first 12 candidates.
    """
    curated = deepcopy(entry)
    candidates = collect_candidates(curated)

    clue_definitions: Dict[str, str] = {}
    clue_sources: Dict[str, str] = {}
    clue_scores: Dict[str, int] = {}
    for level in DIFFICULTIES:
        winner = choose_best_candidate(candidates, curated, level)
        if not winner:
            continue
        clue_definitions[level] = str(winner["text"])
        clue_sources[level] = str(winner["source"])
        clue_scores[level] = score_candidate(winner, curated, level)

    # The medium clue is the preferred definition, with the easy one as fallback.
    preferred_definition = clue_definitions.get("medium") or clue_definitions.get("easy") or ""
    preferred_source = clue_sources.get("medium") or clue_sources.get("easy") or "fallback"

    curation_notes: List[str] = []
    if preferred_definition:
        curation_notes.append(f"preferred_from={preferred_source}")
    if clue_scores.get("medium", -9999) < 20:
        curation_notes.append("weak_medium_definition")

    curated_senses = []
    for candidate in candidates:
        curated_senses.append(
            {
                "definition": candidate["text"],
                "source": candidate["source"],
                "family": candidate["family"],
                "confidence": candidate["confidence"],
                "priority": candidate["priority"],
            }
        )

    curated.update(
        {
            "curated_glosses": dedupe(candidate["text"] for candidate in candidates),
            "curated_senses": curated_senses,
            "preferred_definition": preferred_definition,
            "preferred_source": preferred_source,
            "clue_definitions": clue_definitions,
            "clue_sources": clue_sources,
            "clue_scores": clue_scores,
            "curation_notes": curation_notes,
        }
    )

    # review_reasons reads the preferred_* fields stored just above.
    reasons = review_reasons(curated, candidates)
    severe = {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
    curated["alpha_ready"] = bool(preferred_definition) and severe.isdisjoint(reasons)
    curated["review_reasons"] = reasons

    if not reasons:
        return curated, None

    candidate_pool = []
    for candidate in candidates[:12]:
        candidate_pool.append(
            {
                "text": candidate["text"],
                "source": candidate["source"],
                "family": candidate["family"],
                "confidence": candidate["confidence"],
                "priority": candidate["priority"],
            }
        )
    review_item: Dict[str, object] = {
        "form": curated.get("form"),
        "lemma": curated.get("lemma"),
        "pos": curated.get("pos"),
        "topics": curated.get("topics"),
        "topic_suggestions": curated.get("topic_suggestions"),
        "preferred_definition": preferred_definition,
        "preferred_source": preferred_source,
        "clue_definitions": clue_definitions,
        "review_reasons": reasons,
        "semantic_glosses": (curated.get("semantic") or {}).get("glosses", []),
        "senses": curated.get("senses", []),
        "babelnet_status": (curated.get("babelnet") or {}).get("status"),
        "babelnet_best_synset": (curated.get("babelnet") or {}).get("best_synset"),
        "wiktextract_status": (curated.get("wiktextract") or {}).get("status"),
        "wiktextract": curated.get("wiktextract"),
        "candidate_pool": candidate_pool,
    }
    return curated, review_item
def build_curated_lexicon(args: argparse.Namespace) -> Tuple[Dict[str, object], Dict[str, object]]:
    """Curate every entry of the input lexicon and build the two output payloads.

    Args:
        args: Parsed options; ``input`` and ``max_review`` are read.

    Returns:
        ``(curated_payload, review_payload)``, each with a ``meta`` block and an
        ``entries`` list. When ``max_review`` is positive the review list is
        cut to that many items before either ``meta`` block is filled in.

    Raises:
        ValueError: The input is not a JSON object with an ``entries`` key.
    """
    payload = load_json(args.input)
    if not isinstance(payload, dict) or "entries" not in payload:
        raise ValueError(f"Lessico refined non valido: {args.input}")

    curated_entries: List[Dict[str, object]] = []
    review_entries: List[Dict[str, object]] = []
    for raw_entry in payload.get("entries", []) or []:
        # Entries that are not JSON objects are dropped.
        if isinstance(raw_entry, dict):
            curated, review_item = curate_entry(raw_entry)
            curated_entries.append(curated)
            if review_item:
                review_entries.append(review_item)

    if args.max_review > 0:
        del review_entries[args.max_review:]

    def timestamp() -> str:
        return datetime.now().astimezone().isoformat(timespec="seconds")

    ready_entries = [item for item in curated_entries if item.get("alpha_ready")]
    curated_payload: Dict[str, object] = {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": args.input.name,
            "generated_at": timestamp(),
            "entry_count": len(curated_entries),
            "alpha_ready_count": len(ready_entries),
            "review_count": len(review_entries),
        },
        "entries": curated_entries,
    }
    review_payload: Dict[str, object] = {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": args.input.name,
            "generated_at": timestamp(),
            "entry_count": len(review_entries),
        },
        "entries": review_entries,
    }
    return curated_payload, review_payload
def main() -> None:
    """Run the curation from the command line.

    Parses the options, builds both payloads, writes the curated lexicon and
    the review file, then prints a short summary.
    """
    options = parse_args()
    curated_payload, review_payload = build_curated_lexicon(options)

    write_json(options.output, curated_payload)
    write_json(options.review_output, review_payload)

    curated_meta = curated_payload["meta"]
    review_meta = review_payload["meta"]
    print(f"Lessico curated generato: {options.output}")
    print(f"Voci totali: {curated_meta['entry_count']}")
    print(f"Voci alpha_ready: {curated_meta['alpha_ready_count']}")
    print(f"Voci da revisionare: {review_meta['entry_count']}")
    print(f"File review generato: {options.review_output}")


if __name__ == "__main__":
    main()