alpha01 backoffice: crossword engine, lexicon curation and JSON contract

2026-04-29 13:24:04 +02:00
parent a1f8cb8577
commit 47d8957e15
20 changed files with 5985 additions and 16 deletions
--- a/refine_lexicon_topics.py
+++ b/refine_lexicon_topics.py
@@ -0,0 +1,473 @@
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import unicodedata
+from copy import deepcopy
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Iterable, List, Tuple
+
+from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH
+
+
+# Default output location: "lexicon_it_refined.json" in the same directory as
+# this script. Used as the default of the --output command-line option.
+REFINED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_refined.json")
+
+# Keyword table used by infer_topic_scores: each topic name maps to the
+# lowercase Italian terms that are searched in an entry's flattened text.
+# The same term may appear under several topics (e.g. "abate" is listed for
+# both "religion" and "ecclesiastical_hierarchy") and then scores for each.
+# Several items look like truncated stems ("ecclesiast", "sacerdot",
+# "trasport", "medic", "bombard") - presumably so that inflected forms match;
+# confirm before replacing them with full words.
+TOPIC_KEYWORDS: Dict[str, Tuple[str, ...]] = {
+    "religion": (
+        "abbazia",
+        "abate",
+        "arcivescovo",
+        "cappella",
+        "cardinale",
+        "chiesa",
+        "clero",
+        "convento",
+        "diocesi",
+        "ecclesiast",
+        "fede",
+        "frate",
+        "mistica",
+        "monaco",
+        "monastero",
+        "parrocchia",
+        "prete",
+        "religion",
+        "sacerdot",
+        "santo",
+        "vescovo",
+    ),
+    "ecclesiastical_hierarchy": (
+        "abate",
+        "arcivescovo",
+        "carica ecclesiastica",
+        "cardinale",
+        "clero",
+        "dignità ecclesiastica",
+        "ecclesiast",
+        "ordinazione",
+        "parroco",
+        "patriarca",
+        "pontefice",
+        "prete",
+        "priore",
+        "superiore del monastero",
+        "vescovo",
+    ),
+    "honorific_title": (
+        "carica",
+        "epiteto",
+        "nobile",
+        "onore",
+        "onorific",
+        "titolo",
+    ),
+    "mysticism": (
+        "asceta",
+        "contemplazione",
+        "estasi",
+        "mistica",
+        "mistico",
+        "monachesimo",
+        "spiritual",
+    ),
+    "geography": (
+        "borgo",
+        "città",
+        "comune",
+        "frazione",
+        "geografia",
+        "isola",
+        "località",
+        "paese",
+        "provincia",
+        "regione",
+        "stato",
+        "toponimo",
+        "valle",
+    ),
+    "transport": (
+        "aereo",
+        "aeroplano",
+        "auto",
+        "autobus",
+        "autocarro",
+        "barca",
+        "bicicletta",
+        "imbarcazione",
+        "locomotiva",
+        "motore",
+        "nave",
+        "pista",
+        "porto",
+        "stazione",
+        "traghetto",
+        "treno",
+        "trasport",
+        "veicolo",
+        "viaggio",
+    ),
+    "nature": (
+        "acqua",
+        "albero",
+        "animale",
+        "bosco",
+        "fiore",
+        "fiume",
+        "foresta",
+        "mare",
+        "montagna",
+        "natura",
+        "pianta",
+        "terra",
+    ),
+    "health": (
+        "ambulanza",
+        "anemia",
+        "cura",
+        "farmaco",
+        "malattia",
+        "medic",
+        "ospedale",
+        "paziente",
+        "salute",
+        "soccorso",
+        "terapia",
+    ),
+    "war": (
+        "arma",
+        "artiglieria",
+        "assalto",
+        "battaglia",
+        "bombard",
+        "esercito",
+        "fortezza",
+        "guerra",
+        "militare",
+        "soldato",
+        "trincea",
+    ),
+}
+
+# Slugs that cleanup_tag rejects as semantic tags: Italian articles,
+# prepositions and conjunctions, plus the English words "and" and "the".
+TAG_STOPWORDS = {
+    "and",
+    "con",
+    "da",
+    "dei",
+    "del",
+    "della",
+    "delle",
+    "dello",
+    "di",
+    "e",
+    "il",
+    "in",
+    "la",
+    "le",
+    "lo",
+    "nel",
+    "nella",
+    "per",
+    "su",
+    "the",
+    "un",
+    "una",
+}
+
+
+def parse_args(argv: List[str] | None = None) -> argparse.Namespace:
+    """Parse the command-line options of the lexicon refinement script.
+
+    Args:
+        argv: Argument list to parse instead of the process arguments. With
+            the default ``None`` argparse reads ``sys.argv[1:]``, so the
+            zero-argument call made by ``main`` behaves as before while
+            programmatic callers can pass their own options.
+
+    Returns:
+        Namespace with ``input`` and ``output`` (``Path``), ``replace_general``
+        (``bool``) and ``min_topic_score`` (``int``).
+    """
+    parser = argparse.ArgumentParser(
+        description="Genera un lessico raffinato con campi aggiuntivi per topic, tag semantici e sensi."
+    )
+    parser.add_argument(
+        "--input",
+        type=Path,
+        default=ENRICHED_LEXICON_OUTPUT_PATH,
+        help="File lessicale di partenza, tipicamente lexicon_it_enriched.json.",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=REFINED_LEXICON_OUTPUT_PATH,
+        help="Nuovo file lessicale raffinato da generare.",
+    )
+    parser.add_argument(
+        "--replace-general",
+        action="store_true",
+        # Help text previously read "e alta" (missing accent on the verb).
+        help="Se attivo, sostituisce topic=['general'] con i topic suggeriti quando la confidenza è alta.",
+    )
+    parser.add_argument(
+        "--min-topic-score",
+        type=int,
+        default=40,
+        help="Punteggio minimo per promuovere un topic suggerito nei topics finali.",
+    )
+    return parser.parse_args(argv)
+
+
+def load_json(path: Path) -> Dict[str, object]:
+    """Read ``path`` as UTF-8 text and return the decoded JSON document."""
+    with path.open("r", encoding="utf-8") as handle:
+        return json.load(handle)
+
+
+def write_json(path: Path, payload: Dict[str, object]) -> None:
+    """Write ``payload`` to ``path`` as UTF-8 JSON indented by two spaces.
+
+    Non-ASCII characters are written as-is rather than as ``\\u`` escapes.
+    The payload is serialized before the file is opened, so a serialization
+    error does not truncate an existing file.
+    """
+    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
+    with path.open("w", encoding="utf-8") as handle:
+        handle.write(serialized)
+
+
+def dedupe(items: Iterable[str]) -> List[str]:
+    """Return the distinct non-blank items in first-seen order.
+
+    Each item is converted with ``str`` and stripped. Items are compared
+    case-insensitively and the first spelling encountered is the one kept.
+    """
+    first_spelling: Dict[str, str] = {}
+    for raw in items:
+        candidate = str(raw).strip()
+        if candidate:
+            # setdefault keeps the earliest spelling; dicts preserve insertion order.
+            first_spelling.setdefault(candidate.lower(), candidate)
+    return list(first_spelling.values())
+
+
+def slugify_tag(text: str) -> str:
+    """Turn free text into a lowercase ASCII slug made of ``[a-z0-9_]``.
+
+    Accented letters are folded to their base letter before slugging, so
+    "città" becomes "citta" instead of losing its last letter. Every run of
+    other characters collapses into a single underscore, and leading or
+    trailing underscores are removed.
+
+    Args:
+        text: Raw tag text.
+
+    Returns:
+        The slug, or an empty string when no ASCII letter or digit remains.
+    """
+    # NFKD splits "à" into "a" plus a combining accent, which is then dropped.
+    decomposed = unicodedata.normalize("NFKD", text.strip().lower())
+    folded = "".join(char for char in decomposed if not unicodedata.combining(char))
+    return re.sub(r"[^a-z0-9]+", "_", folded).strip("_")
+
+
+def cleanup_tag(tag: str) -> str:
+    """Slugify ``tag`` and return it, or ``""`` when the slug is not a usable tag.
+
+    A slug is rejected when it is empty, a single character, or listed in
+    ``TAG_STOPWORDS``.
+    """
+    slug = slugify_tag(tag)
+    # len(slug) <= 1 also covers the empty slug.
+    is_noise = len(slug) <= 1 or slug in TAG_STOPWORDS
+    return "" if is_noise else slug
+
+
+def flatten_text(entry: Dict[str, object]) -> str:
+    """Join the textual fields of ``entry`` into one lowercase string.
+
+    Fields are collected in this order: the entry's ``topics``; from
+    ``semantic`` its ``semantic_topics``, ``glosses`` and each synset's
+    ``definition`` and ``lemmas``; from ``babelnet`` its ``synset_refs``, then
+    ``glosses``, ``categories``, ``domains`` and ``senses`` of ``best_synset``
+    followed by the same four fields of every item in ``synsets``. Sections
+    and synsets that are not dicts are skipped; missing or empty lists
+    contribute nothing.
+    """
+
+    def strings(container, key):
+        # A missing key or a falsy value (None, empty list) yields no items.
+        return [str(value) for value in container.get(key, []) or []]
+
+    parts: List[str] = strings(entry, "topics")
+
+    semantic = entry.get("semantic", {})
+    if isinstance(semantic, dict):
+        parts += strings(semantic, "semantic_topics")
+        parts += strings(semantic, "glosses")
+        for synset in semantic.get("synsets", []) or []:
+            if not isinstance(synset, dict):
+                continue
+            parts.append(str(synset.get("definition", "")))
+            parts += strings(synset, "lemmas")
+
+    babelnet = entry.get("babelnet", {})
+    if isinstance(babelnet, dict):
+        parts += strings(babelnet, "synset_refs")
+        best_synset = babelnet.get("best_synset", {})
+        # best_synset comes first, then the full synset list, as separate passes.
+        sources = [best_synset] if isinstance(best_synset, dict) else []
+        sources += [item for item in babelnet.get("synsets", []) or [] if isinstance(item, dict)]
+        for synset in sources:
+            for field in ("glosses", "categories", "domains", "senses"):
+                parts += strings(synset, field)
+
+    return " ".join(parts).lower()
+
+
+def infer_topic_scores(entry: Dict[str, object]) -> Dict[str, int]:
+    """Score each known topic by keyword hits in the entry's flattened text.
+
+    Every hit is worth 12 points and a topic's total is capped at 100. A
+    keyword counts only where it starts a word, i.e. where it is not directly
+    preceded by a letter or digit. Stems such as "medic" therefore still match
+    "medico" or "medicina", while unrelated words that merely contain a
+    keyword ("farmacia" for "arma", "rapporto" for "porto") no longer add to a
+    topic. An underscore counts as a separator, so terms inside identifiers
+    like "comuni_della_provincia" are still found.
+
+    Args:
+        entry: Lexicon entry; its text is gathered by ``flatten_text``.
+
+    Returns:
+        Mapping of topic name to score (12 to 100). Topics without any hit
+        are omitted.
+    """
+    text = flatten_text(entry)
+    scores: Dict[str, int] = {}
+    for topic, keywords in TOPIC_KEYWORDS.items():
+        hits = 0
+        for keyword in keywords:
+            # (?<![^\W_]) rejects a match directly preceded by a letter or
+            # digit. Only the start is anchored because several keywords are
+            # stems that must keep matching longer inflected forms.
+            pattern = r"(?<![^\W_])" + re.escape(keyword.lower())
+            hits += len(re.findall(pattern, text))
+        if hits:
+            scores[topic] = min(12 * hits, 100)
+    return scores
+
+
+def collect_semantic_tags(entry: Dict[str, object]) -> List[str]:
+    """Gather candidate tag strings from ``entry`` and return them as clean slugs.
+
+    Candidates come from the entry's ``topics``, from ``semantic``
+    (``semantic_topics`` and every value list of ``raw_relation_terms``) and
+    from ``babelnet`` (``categories`` and ``domains`` of ``best_synset`` and
+    of each item in ``synsets``). Each candidate goes through ``cleanup_tag``;
+    rejected and duplicate slugs are dropped and first-seen order is kept.
+    """
+    raw_tags: List[str] = [str(topic) for topic in entry.get("topics", []) or []]
+
+    semantic = entry.get("semantic", {})
+    if isinstance(semantic, dict):
+        raw_tags += [str(topic) for topic in semantic.get("semantic_topics", []) or []]
+        relation_terms = semantic.get("raw_relation_terms", {}) or {}
+        for terms in relation_terms.values():
+            raw_tags += [str(term) for term in terms or []]
+
+    babelnet = entry.get("babelnet", {})
+    if isinstance(babelnet, dict):
+        best_synset = babelnet.get("best_synset", {})
+        # best_synset is read first, then every dict in the synsets list.
+        sources = [best_synset] if isinstance(best_synset, dict) else []
+        sources += [item for item in babelnet.get("synsets", []) or [] if isinstance(item, dict)]
+        for synset in sources:
+            for field in ("categories", "domains"):
+                raw_tags += [str(item) for item in synset.get(field, []) or []]
+
+    slugs = [cleanup_tag(tag) for tag in raw_tags]
+    return [slug for slug in dedupe(slugs) if slug]
+
+
+def _text_or_empty(value: object) -> str:
+    """Return ``value`` as stripped text, mapping ``None`` to an empty string."""
+    return "" if value is None else str(value).strip()
+
+
+def collect_senses(entry: Dict[str, object], topic_scores: Dict[str, int]) -> List[Dict[str, object]]:
+    """Build the list of sense records for ``entry``.
+
+    One record is produced per ``semantic`` synset that has a non-empty
+    definition (fixed confidence 0.7), plus one for ``babelnet.best_synset``
+    when it has an ``id`` and at least one non-empty gloss. The BabelNet
+    confidence is ``topic_score / 100`` clamped to the range 0.4 to 0.95.
+
+    JSON ``null`` values are treated as missing rather than being turned into
+    the text "None". A missing, null or non-numeric ``topic_score`` counts as
+    0 instead of raising.
+
+    Args:
+        entry: Lexicon entry to read the synsets from.
+        topic_scores: Inferred topic scores; topics scoring 50 or more are
+            appended to the topics of every sense.
+
+    Returns:
+        Sense dicts with the keys ``source``, ``id``, ``definition``,
+        ``lemmas``, ``topics`` and ``confidence``.
+    """
+    strong_topics = [topic for topic, score in topic_scores.items() if score >= 50]
+    senses: List[Dict[str, object]] = []
+
+    semantic = entry.get("semantic", {})
+    if isinstance(semantic, dict):
+        shared_topics = [topic for topic in semantic.get("semantic_topics", []) or [] if topic is not None]
+        for synset in semantic.get("synsets", []) or []:
+            if not isinstance(synset, dict):
+                continue
+            definition = _text_or_empty(synset.get("definition"))
+            if not definition:
+                continue
+            senses.append(
+                {
+                    "source": "semantic",
+                    "id": synset.get("id"),
+                    "definition": definition,
+                    "lemmas": dedupe(str(item) for item in synset.get("lemmas", []) or [] if item is not None),
+                    "topics": dedupe(shared_topics + strong_topics),
+                    "confidence": 0.7,
+                }
+            )
+
+    babelnet = entry.get("babelnet", {})
+    best_synset = babelnet.get("best_synset", {}) if isinstance(babelnet, dict) else None
+    if isinstance(best_synset, dict) and best_synset.get("id"):
+        glosses = [text for text in map(_text_or_empty, best_synset.get("glosses", []) or []) if text]
+        if glosses:
+            try:
+                raw_score = float(best_synset.get("topic_score") or 0)
+            except (TypeError, ValueError):
+                # An unusable score falls back to 0, i.e. the 0.4 floor below.
+                raw_score = 0.0
+            senses.append(
+                {
+                    "source": "babelnet",
+                    "id": best_synset.get("id"),
+                    "definition": glosses[0],
+                    "lemmas": dedupe(str(item) for item in best_synset.get("senses", []) or [] if item is not None),
+                    # dedupe drops the empty string produced by a missing topic.
+                    "topics": dedupe([_text_or_empty(best_synset.get("topic"))] + strong_topics),
+                    "confidence": round(min(max(raw_score / 100.0, 0.4), 0.95), 2),
+                }
+            )
+
+    return senses
+
+
+def collect_geo_tags(entry: Dict[str, object]) -> List[str]:
+    """Return ``["toponym_possible"]`` when a BabelNet category hints at a place.
+
+    Only the ``categories`` of the dicts in ``babelnet.synsets`` are
+    inspected. A category matches when its lowercase text contains one of the
+    place markers below. Without a match the result is an empty list.
+    """
+    markers = ("comuni_", "province_", "regioni_", "città", "paesi", "località")
+    babelnet = entry.get("babelnet", {})
+    synsets = (babelnet.get("synsets", []) or []) if isinstance(babelnet, dict) else []
+    categories = [
+        str(category).lower()
+        for synset in synsets
+        if isinstance(synset, dict)
+        for category in synset.get("categories", []) or []
+    ]
+    hits = ["toponym_possible" for text in categories if any(marker in text for marker in markers)]
+    # Several matching categories still yield a single tag.
+    return dedupe(hits)
+
+
+def collect_name_tags(entry: Dict[str, object]) -> List[str]:
+    """Return ``["capitalized_form"]`` when the entry's ``form`` starts uppercase.
+
+    A missing or empty ``form`` yields an empty list.
+    """
+    first_char = str(entry.get("form", ""))[:1]
+    return dedupe(["capitalized_form"] if first_char.isupper() else [])
+
+
+def should_review(entry: Dict[str, object], topic_scores: Dict[str, int], senses: List[Dict[str, object]]) -> bool:
+    """Decide whether ``entry`` should be flagged for manual review.
+
+    An entry is flagged when any of these holds:
+
+    * its only topic is "general" (case-insensitive) and no topic scores 50
+      or more;
+    * its BabelNet ``status`` is "ambiguous" and the best score is below 50;
+    * it has at least three senses and at least two topics scoring 50 or more.
+
+    A ``babelnet`` value that is not a dict is treated as having no status,
+    matching how the other collectors in this module guard that field.
+
+    Args:
+        entry: Lexicon entry; only ``topics`` and ``babelnet.status`` are read.
+        topic_scores: Inferred topic scores.
+        senses: Sense records built for the entry.
+
+    Returns:
+        ``True`` when the entry needs review, ``False`` otherwise.
+    """
+    existing_topics = [str(topic).lower() for topic in entry.get("topics", []) or []]
+    best_score = max(topic_scores.values(), default=0)
+    strong_topics = [topic for topic, score in topic_scores.items() if score >= 50]
+
+    babelnet = entry.get("babelnet")
+    # Guard against malformed input instead of calling .get on a non-dict.
+    babelnet_status = str(babelnet.get("status", "")) if isinstance(babelnet, dict) else ""
+
+    if existing_topics == ["general"] and not strong_topics:
+        return True
+    if babelnet_status == "ambiguous" and best_score < 50:
+        return True
+    return len(senses) >= 3 and len(strong_topics) >= 2
+
+
+def promoted_topics(
+    existing_topics: List[str], topic_scores: Dict[str, int], replace_general: bool, min_topic_score: int
+) -> List[str]:
+    """Merge the entry's topics with inferred topics that reach the threshold.
+
+    Inferred topics scoring at least ``min_topic_score`` are ordered by
+    descending score, then by name. Normally they are appended to the
+    de-duplicated existing topics. When ``replace_general`` is set, the
+    existing topics are exactly ``["general"]`` and at least one inferred
+    topic qualifies, the inferred topics are returned on their own.
+    """
+    ranked = sorted(topic_scores.items(), key=lambda pair: (-pair[1], pair[0]))
+    qualifying = [name for name, value in ranked if value >= min_topic_score]
+    kept = dedupe(existing_topics)
+
+    # NOTE(review): this comparison is case-sensitive, unlike should_review.
+    if replace_general and qualifying and kept == ["general"]:
+        return qualifying
+    return dedupe(kept + qualifying)
+
+
+def refine_entry(entry: Dict[str, object], replace_general: bool, min_topic_score: int) -> Dict[str, object]:
+    """Return a deep copy of ``entry`` enriched with the refinement fields.
+
+    The copy gets an updated ``topics`` list plus ``semantic_tags``,
+    ``senses``, ``topic_confidence`` (raw scores), ``topic_suggestions``
+    (topic names by descending score, then name), ``geo_tags``, ``name_tags``
+    and ``needs_review``. The input entry is not modified.
+    """
+    result = deepcopy(entry)
+
+    # Everything is derived from the copy before any field is overwritten.
+    scores = infer_topic_scores(result)
+    tags = collect_semantic_tags(result)
+    senses = collect_senses(result, scores)
+    geo = collect_geo_tags(result)
+    names = collect_name_tags(result)
+    original_topics = [str(topic) for topic in result.get("topics", []) or []]
+    suggestions = sorted(scores, key=lambda name: (-scores[name], name))
+
+    result["topics"] = promoted_topics(original_topics, scores, replace_general, min_topic_score)
+    # NOTE(review): should_review sees the promoted topics assigned just
+    # above, not the entry's original ones - confirm this is intended.
+    needs_review = should_review(result, scores, senses)
+    result.update(
+        semantic_tags=tags,
+        senses=senses,
+        topic_confidence=scores,
+        topic_suggestions=suggestions,
+        geo_tags=geo,
+        name_tags=names,
+        needs_review=needs_review,
+    )
+    return result
+
+
+def build_refined_lexicon(args: argparse.Namespace) -> Dict[str, object]:
+    """Load the input lexicon, refine every entry and assemble the output payload.
+
+    Args:
+        args: Parsed options; ``input``, ``replace_general`` and
+            ``min_topic_score`` are read.
+
+    Returns:
+        Dict with ``meta`` (language, version, source file name, generation
+        timestamp, options and counters) and ``entries`` (the refined entries).
+
+    Raises:
+        ValueError: If the input document is not a dict with an ``entries`` key.
+    """
+    source = load_json(args.input)
+    if not (isinstance(source, dict) and "entries" in source):
+        raise ValueError(f"Lessico di input non valido: {args.input}")
+
+    refined_entries = []
+    review_count = 0
+    topicful_count = 0
+    for raw_entry in source.get("entries", []) or []:
+        # Items that are not dicts are skipped.
+        if not isinstance(raw_entry, dict):
+            continue
+        refined = refine_entry(raw_entry, args.replace_general, args.min_topic_score)
+        refined_entries.append(refined)
+        if refined.get("needs_review"):
+            review_count += 1
+        if refined.get("topic_suggestions"):
+            topicful_count += 1
+
+    meta = {
+        "language": "it",
+        "version": 1,
+        "base_lexicon": args.input.name,
+        # Local time with its UTC offset, to one-second precision.
+        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
+        "entry_count": len(refined_entries),
+        "replace_general": args.replace_general,
+        "min_topic_score": args.min_topic_score,
+        "review_count": review_count,
+        "topicful_count": topicful_count,
+    }
+    return {"meta": meta, "entries": refined_entries}
+
+
+def main() -> None:
+    """Run the refinement from the command line and print a short summary."""
+    args = parse_args()
+    result = build_refined_lexicon(args)
+    write_json(args.output, result)
+
+    meta = result["meta"]
+    print(f"Lessico raffinato generato: {args.output}")
+    print(f"Voci totali: {meta['entry_count']}")
+    print(f"Voci con suggerimenti di topic: {meta['topicful_count']}")
+    print(f"Voci marcate needs_review: {meta['review_count']}")
+
+
+# Run the refinement only when this file is executed as a script.
+if __name__ == "__main__":
+    main()