# cruciverba_1/build_semantic_lexicon.py

from __future__ import annotations

import json
import re
import unicodedata
import xml.etree.ElementTree as ET
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

from build_lexicon import LEXICON_OUTPUT_PATH, infer_topics


# Location of the IWN-OMW LMF dump, resolved relative to this module's directory.
IWN_XML_PATH = Path(__file__).parent.joinpath(
    "iwn-omw-main",
    "IWN-OMW-main",
    "data",
    "LMF-XML",
    "IWN-OMW_LMF_v1.0.xml",
)
# Destination of the enriched lexicon, written next to this module.
SEMANTIC_LEXICON_OUTPUT_PATH = Path(__file__).parent / "lexicon_it_semantic.json"

# Single-letter part-of-speech codes found on <Lemma partOfSpeech="..."> mapped
# to the coarse tags used by the lexicon. Both "a" and "s" collapse to ADJ
# (presumably "s" is WordNet's satellite-adjective code -- confirm against the data).
IWN_POS_MAP = dict(
    n="NOUN",
    v="VERB",
    a="ADJ",
    s="ADJ",
    r="ADV",
)

# Topic label -> set of Italian trigger words. semantic_topics_from_text()
# assigns a topic as soon as any one of its keywords appears among the tokens
# extracted from a gloss or lemma. Keywords are written lowercase and without
# accents (e.g. "citta") because they are compared against text that has been
# ASCII-folded. The same keyword may appear under several topics (e.g. "mare").
SEMANTIC_TOPIC_KEYWORDS = {
    "animals": {
        "animale", "animali", "mammifero", "mammiferi", "uccello", "uccelli", "pesce",
        "rettile", "domestico", "compagnia", "caccia", "pastorizia",
    },
    "plants": {
        "pianta", "piante", "albero", "alberi", "fiore", "foglia", "foglie", "frutto",
        "ghianda", "bosco", "radice", "seme", "vegetale",
    },
    "nature": {
        "natura", "naturale", "terra", "acqua", "aria", "mare", "montagna", "bosco",
        "lago", "fiume", "vento", "roccia", "suolo", "superficie", "terrestre",
    },
    "ecology": {
        "ecologia", "ambiente", "ambientale", "clima", "energia", "naturale", "verde",
        "ecosistema", "acqua", "terra",
    },
    "geography": {
        "territorio", "superficie", "terrestre", "regione", "confine", "montagna",
        "isola", "mare", "lago", "fiume",
    },
    "weather": {
        "clima", "pioggia", "vento", "nuvola", "nebbia", "tempesta", "gelo", "brina",
        "atmosfera",
    },
    "sea": {
        "mare", "marino", "marina", "acque", "salate", "porto", "barca", "vela",
        "nave", "fondale",
    },
    "mountain": {
        "montagna", "vetta", "collina", "rilievo", "alpino", "roccia", "sentiero",
    },
    "health": {
        "salute", "medico", "medicina", "corpo", "sangue", "cura", "malattia",
        "terapia", "cervello", "respiro",
    },
    "science": {
        "scienza", "scientifico", "tecnica", "misura", "energia", "materia", "fisica",
        "chimica", "biologia", "strumento",
    },
    "sport": {
        "sport", "gara", "squadra", "pallone", "atleta", "stadio", "rete", "gioco",
        "agonistico",
    },
    "history": {
        "storia", "storico", "antico", "regno", "impero", "senato", "romano", "epoca",
    },
    "school": {
        "scuola", "lezione", "studente", "classe", "maestro", "esame", "libro",
        "quaderno", "aula",
    },
    "cinema": {
        "film", "cinema", "pellicola", "regista", "attore", "scena", "spettacolo",
        "teatro",
    },
    "literature": {
        "libro", "autore", "lettura", "scrittura", "racconto", "poesia", "romanzo",
        "letteratura",
    },
    "food": {
        "cibo", "bevanda", "mangiare", "pane", "frutto", "latte", "zucchero", "farina",
        "gelato",
    },
    "city": {
        "citta", "urbano", "strada", "piazza", "ponte", "palazzo", "stazione", "porta",
        "quartiere",
    },
    "transport": {
        "veicolo", "trasporto", "strada", "motore", "treno", "ruota", "barca", "nave",
        "aereo", "automobile",
    },
    "work": {
        "lavoro", "mestiere", "opera", "progetto", "strumento", "tecnica", "servizio",
    },
    "home": {
        "casa", "abitazione", "porta", "finestra", "parete", "camera", "balcone",
        "tavolo", "sedia",
    },
}


def normalize_word(text: str) -> str:
    """Reduce *text* to a lookup key made only of the letters ``a``-``z``.

    Accented characters are decomposed (NFKD) and their non-ASCII parts
    dropped, the result is lowercased, and every remaining character that is
    not a lowercase ASCII letter (spaces, digits, underscores, punctuation)
    is removed. The result may be the empty string.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    folded = decomposed.encode("ascii", "ignore").decode("ascii").lower()
    # Keeping only a-z also discards underscores, so no separate replace is needed.
    return "".join(re.findall(r"[a-z]", folded))


def semantic_topics_from_text(parts: Iterable[str]) -> List[str]:
    """Infer topic labels from free text by keyword lookup.

    Every non-empty part contributes two kinds of tokens:

    * its fully collapsed form as produced by ``normalize_word`` (so a
      single-word lemma matches as a whole), and
    * each individual word of the part, after accent folding and lowercasing.

    A topic from ``SEMANTIC_TOPIC_KEYWORDS`` is reported when at least one of
    its keywords is among the collected tokens.

    Args:
        parts: Glosses, lemmas or other text fragments to scan.

    Returns:
        The matching topic labels, sorted alphabetically (possibly empty).
    """
    tokens = set()
    for part in parts:
        collapsed = normalize_word(part)
        if not collapsed:
            continue
        tokens.add(collapsed)
        # Split into words BEFORE separators are stripped. normalize_word()
        # removes spaces and punctuation, so tokenising its output would only
        # ever return the single collapsed string and a multi-word gloss such
        # as "animale domestico" could never match a keyword.
        folded = unicodedata.normalize("NFKD", part).encode("ascii", "ignore").decode("ascii")
        tokens.update(re.findall(r"[a-z]+", folded.lower()))

    return sorted(
        topic
        for topic, keywords in SEMANTIC_TOPIC_KEYWORDS.items()
        if any(keyword in tokens for keyword in keywords)
    )


def parse_iwn() -> Tuple[Dict[str, Dict[str, object]], Dict[str, List[Dict[str, object]]]]:
    """Read the IWN-OMW LMF file and index its synsets and lexical entries.

    Returns:
        A pair ``(synsets, entries_by_norm)``.

        ``synsets`` maps a synset id to a dict with the keys ``id``, ``ili``,
        ``definition``, ``relations`` (list of ``type``/``target``/``subtype``
        dicts), ``lemmas`` (written forms, duplicates removed, first-seen
        order) and ``pos`` (the tag of the last lexical entry that referenced
        the synset, or ``""`` if none did).

        ``entries_by_norm`` maps a normalized lemma to the lexical entries
        sharing that key; each entry holds ``written_form``,
        ``normalized_form``, ``pos`` and ``senses``. Entries without any
        sense pointing at a known synset are left out.
    """
    raw_xml = IWN_XML_PATH.read_text(encoding="utf-8")
    # Rewrite the `"  -->` sequence into a plain tag close before parsing.
    # NOTE(review): presumably a workaround for a malformed tag in the
    # distributed XML -- confirm against the data file.
    document = ET.fromstring(raw_xml.replace('"  -->', '">'))

    subtype_attr = "{https://globalwordnet.github.io/schemas/dc/}type"

    # Pass 1: register every synset with empty lemma/pos slots.
    synsets: Dict[str, Dict[str, object]] = {}
    for synset_node in document.findall(".//Synset"):
        attributes = synset_node.attrib
        identifier = attributes.get("id", "")
        relation_records = []
        for relation_node in synset_node.findall("SynsetRelation"):
            relation_records.append(
                {
                    "type": relation_node.attrib.get("relType", ""),
                    "target": relation_node.attrib.get("target", ""),
                    "subtype": relation_node.attrib.get(subtype_attr, ""),
                }
            )
        synsets[identifier] = {
            "id": identifier,
            "ili": attributes.get("ili", ""),
            "definition": (synset_node.findtext("Definition") or "").strip(),
            "relations": relation_records,
            "lemmas": [],
            "pos": "",
        }

    # Pass 2: walk the lexical entries, filling in lemmas/pos on the synsets
    # they reference and grouping the entries by normalized written form.
    entries_by_norm: Dict[str, List[Dict[str, object]]] = defaultdict(list)
    for entry_node in document.findall(".//LexicalEntry"):
        lemma_node = entry_node.find("Lemma")
        if lemma_node is None:
            continue

        written_form = lemma_node.attrib.get("writtenForm", "").strip()
        normalized_form = normalize_word(written_form)
        if not normalized_form:
            continue

        pos_code = lemma_node.attrib.get("partOfSpeech", "").strip().lower()
        # Unknown or missing codes fall back to NOUN.
        pos = IWN_POS_MAP.get(pos_code, "NOUN")

        sense_payloads = []
        for sense_node in entry_node.findall("Sense"):
            synset_id = sense_node.attrib.get("synset", "")
            if not synset_id:
                continue
            referenced = synsets.get(synset_id)
            if referenced is None:
                continue

            referenced["lemmas"].append(written_form)
            referenced["pos"] = pos
            sense_payloads.append(
                {
                    "sense_id": sense_node.attrib.get("id", ""),
                    "synset_id": synset_id,
                    "pos": pos,
                }
            )

        if not sense_payloads:
            continue
        entries_by_norm[normalized_form].append(
            {
                "written_form": written_form,
                "normalized_form": normalized_form,
                "pos": pos,
                "senses": sense_payloads,
            }
        )

    # Drop repeated lemmas while keeping the order of first appearance.
    for record in synsets.values():
        record["lemmas"] = list(dict.fromkeys(record["lemmas"]))

    return synsets, entries_by_norm


def score_sense(
    synset_id: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> int:
    """Score how well a synset fits the topics already assigned to an entry.

    The score is ten points per topic shared between ``current_topics`` and
    the topics inferred from the synset's definition and lemmas, plus the
    length of the definition in characters. An unknown ``synset_id`` is
    treated as a synset with no definition and no lemmas.

    NOTE(review): because the definition length is added unscaled, a long
    gloss can outweigh several topic overlaps -- confirm this is intended.
    """
    record = synsets.get(synset_id, {})
    gloss = str(record.get("definition", ""))
    text_parts = [gloss, *record.get("lemmas", [])]
    inferred = semantic_topics_from_text(text_parts)
    wanted = {str(topic) for topic in current_topics}
    shared = wanted.intersection(inferred)
    return 10 * len(shared) + len(gloss)


def best_candidate(
    candidates: List[Dict[str, object]],
    expected_pos: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> Dict[str, object]:
    """Pick the lexical entry that best matches the expected POS and topics.

    Each candidate earns 100 points when its ``pos`` equals ``expected_pos``
    plus the highest ``score_sense`` value among its senses (0 when it has
    none). The highest total wins; on a tie the candidate that appears first
    in ``candidates`` is returned.

    Raises:
        IndexError: if ``candidates`` is empty.
    """

    def total_score(candidate: Dict[str, object]) -> int:
        pos_points = 100 if candidate["pos"] == expected_pos else 0
        sense_points = max(
            [0]
            + [
                score_sense(str(sense.get("synset_id", "")), current_topics, synsets)
                for sense in candidate.get("senses", [])
            ]
        )
        return pos_points + sense_points

    scored = [(total_score(candidate), candidate) for candidate in candidates]
    # sorted() is stable, so equally scored candidates keep their input order.
    by_score = sorted(scored, key=lambda pair: pair[0], reverse=True)
    return by_score[0][1]


def dedupe_keep_order(items: Iterable[str]) -> List[str]:
    """Return the distinct, non-blank items in order of first appearance.

    Each item is converted with ``str()`` and stripped of surrounding
    whitespace before comparison; items that end up empty are dropped.
    """
    cleaned = (str(item).strip() for item in items)
    # dict keys are unique and keep insertion order, which gives a
    # first-occurrence de-duplication in a single pass.
    return [text for text in dict.fromkeys(cleaned) if text]


def resolve_related_lemmas(
    synset_ids: Iterable[str],
    relation_type: str,
    synsets: Dict[str, Dict[str, object]],
) -> List[str]:
    """Collect the lemmas of all synsets reachable through one relation type.

    For every id in ``synset_ids`` that is present in ``synsets``, each
    relation whose ``type`` equals ``relation_type`` is followed and the
    lemmas of its target synset are gathered. Unknown source or target ids
    are silently skipped.

    Returns:
        The gathered lemmas, de-duplicated in order of first appearance.
    """

    def lemmas_of_targets():
        for synset_id in synset_ids:
            source = synsets.get(synset_id) or {}
            for relation in source.get("relations", []):
                if relation.get("type") == relation_type:
                    target = synsets.get(relation.get("target", "")) or {}
                    yield from target.get("lemmas", [])

    return dedupe_keep_order(lemmas_of_targets())


def collect_relation_terms(
    synset_ids: Iterable[str],
    relation_types: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
    *,
    max_terms: int = 20,
) -> Dict[str, List[str]]:
    """Group related lemmas by relation type.

    Args:
        synset_ids: Ids of the synsets whose relations are followed.
        relation_types: Relation types to collect (e.g. ``"hypernym"``).
        synsets: Synset index mapping ids to synset records.
        max_terms: Maximum number of lemmas kept per relation type.

    Returns:
        A dict with one key per relation type, each holding at most
        ``max_terms`` lemmas as returned by ``resolve_related_lemmas``.
    """
    # The ids are walked once per relation type; materialise them so that a
    # one-shot iterator is not exhausted after the first type.
    source_ids = list(synset_ids)
    return {
        relation_type: resolve_related_lemmas(source_ids, relation_type, synsets)[:max_terms]
        for relation_type in relation_types
    }


def enrich_entry(
    entry: Dict[str, object],
    synsets: Dict[str, Dict[str, object]],
    entries_by_norm: Dict[str, List[Dict[str, object]]],
) -> Dict[str, object]:
    """Attach IWN-OMW semantic information to one lexicon entry.

    The entry is looked up by the normalized values of its ``form``,
    ``lemma`` and ``normalized_form`` fields. When nothing matches, an
    ``entry["semantic"]`` payload with ``matched: False`` and empty
    collections is stored. Otherwise the best lexical entry is chosen with
    ``best_candidate``, its senses are ranked with ``score_sense``, and the
    three best synsets supply glosses, synonyms and per-synset relation terms.

    Args:
        entry: Lexicon entry; it is modified in place.
        synsets: Synset index as returned by ``parse_iwn``.
        entries_by_norm: Lexical entries grouped by normalized lemma.

    Returns:
        The same ``entry`` object, with ``semantic`` set and, on a match,
        ``topics`` extended with the inferred semantic topics.
    """
    relation_types = ("hypernym", "hyponym", "similar")

    # Computed once: used both as a lookup key and to filter the synonyms.
    form_key = normalize_word(str(entry.get("form", "")))
    lookup_keys = dedupe_keep_order(
        [
            form_key,
            normalize_word(str(entry.get("lemma", ""))),
            normalize_word(str(entry.get("normalized_form", ""))),
        ]
    )
    matches: List[Dict[str, object]] = []
    for lookup_key in lookup_keys:
        matches.extend(entries_by_norm.get(lookup_key, []))

    if not matches:
        entry["semantic"] = {
            "source": "iwn-omw",
            "matched": False,
            "match_count": 0,
            "synsets": [],
            "synonyms": [],
            "raw_relation_terms": {},
            "glosses": [],
            "semantic_topics": [],
        }
        return entry

    # Tolerate a missing or null "topics" field.
    entry_topics = list(entry.get("topics") or [])

    selected = best_candidate(matches, str(entry.get("pos", "")), entry_topics, synsets)
    sorted_senses = sorted(
        selected.get("senses", []),
        key=lambda sense: score_sense(str(sense.get("synset_id", "")), entry_topics, synsets),
        reverse=True,
    )
    synset_ids = [sense["synset_id"] for sense in sorted_senses]

    synset_payloads = []
    glosses = []
    synonyms = []
    # Only the three best-ranked synsets feed glosses, synonyms and payloads.
    for synset_id in synset_ids[:3]:
        synset = synsets.get(synset_id)
        if not synset:
            continue
        gloss = str(synset.get("definition", "")).strip()
        glosses.append(gloss)
        synset_payloads.append(
            {
                "id": synset_id,
                "pos": synset.get("pos", ""),
                "definition": gloss,
                "lemmas": dedupe_keep_order(synset.get("lemmas", []))[:12],
                "raw_relation_terms": collect_relation_terms(
                    [synset_id],
                    relation_types,
                    synsets,
                ),
            }
        )
        synonyms.extend(synset.get("lemmas", []))

    # NOTE(review): entry-level relation terms are gathered over ALL senses of
    # the selected entry, not just the top three used above -- confirm intent.
    raw_relation_terms = collect_relation_terms(synset_ids, relation_types, synsets)

    # A lemma that normalizes to the entry's own form is not a synonym of it.
    synonyms = [
        lemma
        for lemma in dedupe_keep_order(synonyms)
        if normalize_word(lemma) != form_key
    ][:20]
    glosses = dedupe_keep_order(glosses)

    topic_text = glosses + synonyms
    for relation_type in relation_types:
        topic_text.extend(raw_relation_terms.get(relation_type, []))
    # Existing topics come first, followed by newly inferred ones.
    semantic_topics = dedupe_keep_order(entry_topics + semantic_topics_from_text(topic_text))

    # semantic_topics already starts with the de-duplicated existing topics,
    # so it is the merged topic list; store a copy to avoid aliasing.
    entry["topics"] = list(semantic_topics)
    entry["semantic"] = {
        "source": "iwn-omw",
        "matched": True,
        "match_count": len(matches),
        "selected_form": selected.get("written_form", ""),
        "synsets": synset_payloads,
        "synonyms": synonyms,
        "raw_relation_terms": raw_relation_terms,
        "glosses": glosses,
        "semantic_topics": semantic_topics,
    }
    return entry


def build_semantic_lexicon() -> Dict[str, object]:
    """Build the semantically enriched lexicon as a JSON-serialisable dict.

    Loads the base lexicon from ``LEXICON_OUTPUT_PATH``, parses the IWN-OMW
    file, and enriches a shallow copy of every base entry.

    Returns:
        A dict with a ``meta`` section and the list of enriched ``entries``.

    Raises:
        FileNotFoundError: if the base lexicon or the IWN-OMW XML is missing.
    """
    if not LEXICON_OUTPUT_PATH.exists():
        raise FileNotFoundError(f"Lessico di base non trovato: {LEXICON_OUTPUT_PATH}")
    if not IWN_XML_PATH.exists():
        raise FileNotFoundError(f"File Open ItalWordNet non trovato: {IWN_XML_PATH}")

    base_lexicon = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
    synsets, entries_by_norm = parse_iwn()

    # dict(...) makes a shallow copy so the loaded base entries are not
    # replaced key-by-key while being enriched.
    enriched_entries = [
        enrich_entry(dict(base_entry), synsets, entries_by_norm)
        for base_entry in base_lexicon.get("entries", [])
    ]

    meta = {
        "language": "it",
        "version": 1,
        "base_lexicon": str(LEXICON_OUTPUT_PATH.name),
        "sources": [
            "lexicon_it.json",
            "iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml",
        ],
        # Local time with its UTC offset, truncated to whole seconds.
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(enriched_entries),
        "semantic_source": "IWN-OMW v1.0",
    }
    return {"meta": meta, "entries": enriched_entries}


def main() -> None:
    """Generate the semantic lexicon, write it to disk and print a summary."""
    payload = build_semantic_lexicon()

    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    SEMANTIC_LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    # Count the entries for which a WordNet match was found.
    matched = len(
        [entry for entry in payload["entries"] if entry.get("semantic", {}).get("matched")]
    )
    print(f"Lessico semantico generato: {SEMANTIC_LEXICON_OUTPUT_PATH}")
    print(f"Voci totali: {payload['meta']['entry_count']}")
    print(f"Voci con match semantico: {matched}")

# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()