feat: collega il lessico semantico al filler

2026-04-14 18:56:17 +02:00
parent 77c7e709b6
commit b172b9c04b
15 changed files with 2255563 additions and 9 deletions
--- a/build_semantic_lexicon.py
+++ b/build_semantic_lexicon.py
@@ -0,0 +1,426 @@
+from __future__ import annotations
+
+import json
+import re
+import unicodedata
+import xml.etree.ElementTree as ET
+from collections import defaultdict
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Iterable, List, Tuple
+
+from build_lexicon import LEXICON_OUTPUT_PATH, infer_topics
+
+
+# Location of the Open ItalWordNet (IWN-OMW) LMF XML file, resolved relative to this script.
+IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml"
+# Destination of the enriched lexicon written by main(), next to this script.
+SEMANTIC_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_semantic.json")
+
+# Maps the partOfSpeech codes read from IWN <Lemma> elements to the POS tags that
+# best_candidate() compares against a lexicon entry's "pos" value.
+# Codes missing from this table fall back to "NOUN" in parse_iwn().
+IWN_POS_MAP = {
+    "n": "NOUN",
+    "v": "VERB",
+    "a": "ADJ",
+    "s": "ADJ",
+    "r": "ADV",
+}
+
+# Topic name -> set of Italian trigger words. semantic_topics_from_text() assigns a
+# topic as soon as one of its keywords appears among the normalized tokens.
+# Keywords must already be in normalize_word() form (lowercase ASCII letters only,
+# accents stripped, e.g. "citta"); anything else can never match.
+# The same keyword may trigger several topics (e.g. "mare", "bosco").
+SEMANTIC_TOPIC_KEYWORDS = {
+    "animals": {
+        "animale", "animali", "mammifero", "mammiferi", "uccello", "uccelli", "pesce",
+        "rettile", "domestico", "compagnia", "caccia", "pastorizia",
+    },
+    "plants": {
+        "pianta", "piante", "albero", "alberi", "fiore", "foglia", "foglie", "frutto",
+        "ghianda", "bosco", "radice", "seme", "vegetale",
+    },
+    "nature": {
+        "natura", "naturale", "terra", "acqua", "aria", "mare", "montagna", "bosco",
+        "lago", "fiume", "vento", "roccia", "suolo", "superficie", "terrestre",
+    },
+    "ecology": {
+        "ecologia", "ambiente", "ambientale", "clima", "energia", "naturale", "verde",
+        "ecosistema", "acqua", "terra",
+    },
+    "geography": {
+        "territorio", "superficie", "terrestre", "regione", "confine", "montagna",
+        "isola", "mare", "lago", "fiume",
+    },
+    "weather": {
+        "clima", "pioggia", "vento", "nuvola", "nebbia", "tempesta", "gelo", "brina",
+        "atmosfera",
+    },
+    "sea": {
+        "mare", "marino", "marina", "acque", "salate", "porto", "barca", "vela",
+        "nave", "fondale",
+    },
+    "mountain": {
+        "montagna", "vetta", "collina", "rilievo", "alpino", "roccia", "sentiero",
+    },
+    "health": {
+        "salute", "medico", "medicina", "corpo", "sangue", "cura", "malattia",
+        "terapia", "cervello", "respiro",
+    },
+    "science": {
+        "scienza", "scientifico", "tecnica", "misura", "energia", "materia", "fisica",
+        "chimica", "biologia", "strumento",
+    },
+    "sport": {
+        "sport", "gara", "squadra", "pallone", "atleta", "stadio", "rete", "gioco",
+        "agonistico",
+    },
+    "history": {
+        "storia", "storico", "antico", "regno", "impero", "senato", "romano", "epoca",
+    },
+    "school": {
+        "scuola", "lezione", "studente", "classe", "maestro", "esame", "libro",
+        "quaderno", "aula",
+    },
+    "cinema": {
+        "film", "cinema", "pellicola", "regista", "attore", "scena", "spettacolo",
+        "teatro",
+    },
+    "literature": {
+        "libro", "autore", "lettura", "scrittura", "racconto", "poesia", "romanzo",
+        "letteratura",
+    },
+    "food": {
+        "cibo", "bevanda", "mangiare", "pane", "frutto", "latte", "zucchero", "farina",
+        "gelato",
+    },
+    "city": {
+        "citta", "urbano", "strada", "piazza", "ponte", "palazzo", "stazione", "porta",
+        "quartiere",
+    },
+    "transport": {
+        "veicolo", "trasporto", "strada", "motore", "treno", "ruota", "barca", "nave",
+        "aereo", "automobile",
+    },
+    "work": {
+        "lavoro", "mestiere", "opera", "progetto", "strumento", "tecnica", "servizio",
+    },
+    "home": {
+        "casa", "abitazione", "porta", "finestra", "parete", "camera", "balcone",
+        "tavolo", "sedia",
+    },
+}
+
+
+def normalize_word(text: str) -> str:
+    normalized = unicodedata.normalize("NFKD", text)
+    ascii_only = normalized.encode("ascii", "ignore").decode("ascii")
+    ascii_only = ascii_only.lower().replace("_", "")
+    ascii_only = re.sub(r"[^a-z]", "", ascii_only)
+    return ascii_only
+
+
+def semantic_topics_from_text(parts: Iterable[str]) -> List[str]:
+    tokens = set()
+    for part in parts:
+        normalized = normalize_word(part)
+        if not normalized:
+            continue
+        tokens.add(normalized)
+        tokens.update(filter(None, re.findall(r"[a-z]+", normalize_word(part))))
+
+    topics = set()
+    for topic, keywords in SEMANTIC_TOPIC_KEYWORDS.items():
+        if any(keyword in tokens for keyword in keywords):
+            topics.add(topic)
+    return sorted(topics)
+
+
+def parse_iwn() -> Tuple[Dict[str, Dict[str, object]], Dict[str, List[Dict[str, object]]]]:
+    xml_text = IWN_XML_PATH.read_text(encoding="utf-8")
+    xml_text = xml_text.replace('"  -->', '">')
+    root = ET.fromstring(xml_text)
+
+    synsets: Dict[str, Dict[str, object]] = {}
+    entries_by_norm: Dict[str, List[Dict[str, object]]] = defaultdict(list)
+
+    for synset in root.findall(".//Synset"):
+        synset_id = synset.attrib.get("id", "")
+        relations = [
+            {
+                "type": relation.attrib.get("relType", ""),
+                "target": relation.attrib.get("target", ""),
+                "subtype": relation.attrib.get("{https://globalwordnet.github.io/schemas/dc/}type", ""),
+            }
+            for relation in synset.findall("SynsetRelation")
+        ]
+        synsets[synset_id] = {
+            "id": synset_id,
+            "ili": synset.attrib.get("ili", ""),
+            "definition": (synset.findtext("Definition") or "").strip(),
+            "relations": relations,
+            "lemmas": [],
+            "pos": "",
+        }
+
+    for lexical_entry in root.findall(".//LexicalEntry"):
+        lemma = lexical_entry.find("Lemma")
+        if lemma is None:
+            continue
+
+        written_form = lemma.attrib.get("writtenForm", "").strip()
+        normalized_form = normalize_word(written_form)
+        if not normalized_form:
+            continue
+
+        pos = IWN_POS_MAP.get(lemma.attrib.get("partOfSpeech", "").strip().lower(), "NOUN")
+        senses = lexical_entry.findall("Sense")
+        sense_payloads = []
+
+        for sense in senses:
+            synset_id = sense.attrib.get("synset", "")
+            sense_id = sense.attrib.get("id", "")
+            if not synset_id or synset_id not in synsets:
+                continue
+
+            synsets[synset_id]["lemmas"].append(written_form)
+            synsets[synset_id]["pos"] = pos
+            sense_payloads.append(
+                {
+                    "sense_id": sense_id,
+                    "synset_id": synset_id,
+                    "pos": pos,
+                }
+            )
+
+        if sense_payloads:
+            entries_by_norm[normalized_form].append(
+                {
+                    "written_form": written_form,
+                    "normalized_form": normalized_form,
+                    "pos": pos,
+                    "senses": sense_payloads,
+                }
+            )
+
+    for synset in synsets.values():
+        unique_lemmas = []
+        seen = set()
+        for lemma in synset["lemmas"]:
+            if lemma not in seen:
+                seen.add(lemma)
+                unique_lemmas.append(lemma)
+        synset["lemmas"] = unique_lemmas
+
+    return synsets, entries_by_norm
+
+
+def score_sense(
+    synset_id: str,
+    current_topics: Iterable[str],
+    synsets: Dict[str, Dict[str, object]],
+) -> int:
+    synset = synsets.get(synset_id, {})
+    definition = str(synset.get("definition", ""))
+    inferred_topics = set(semantic_topics_from_text([definition] + list(synset.get("lemmas", []))))
+    current_topics_set = set(str(topic) for topic in current_topics)
+    overlap = len(inferred_topics & current_topics_set)
+    return overlap * 10 + len(definition)
+
+
+def best_candidate(
+    candidates: List[Dict[str, object]],
+    expected_pos: str,
+    current_topics: Iterable[str],
+    synsets: Dict[str, Dict[str, object]],
+) -> Dict[str, object]:
+    ranked = []
+    for candidate in candidates:
+        pos_bonus = 100 if candidate["pos"] == expected_pos else 0
+        sense_bonus = 0
+        for sense in candidate.get("senses", []):
+            sense_bonus = max(
+                sense_bonus,
+                score_sense(str(sense.get("synset_id", "")), current_topics, synsets),
+            )
+        ranked.append((pos_bonus + sense_bonus, candidate))
+    ranked.sort(key=lambda item: item[0], reverse=True)
+    return ranked[0][1]
+
+
+def dedupe_keep_order(items: Iterable[str]) -> List[str]:
+    seen = set()
+    result = []
+    for item in items:
+        text = str(item).strip()
+        if not text or text in seen:
+            continue
+        seen.add(text)
+        result.append(text)
+    return result
+
+
+def resolve_related_lemmas(
+    synset_ids: Iterable[str],
+    relation_type: str,
+    synsets: Dict[str, Dict[str, object]],
+) -> List[str]:
+    related = []
+    for synset_id in synset_ids:
+        synset = synsets.get(synset_id)
+        if not synset:
+            continue
+        for relation in synset.get("relations", []):
+            if relation.get("type") != relation_type:
+                continue
+            target = relation.get("target", "")
+            target_synset = synsets.get(target)
+            if not target_synset:
+                continue
+            related.extend(target_synset.get("lemmas", []))
+    return dedupe_keep_order(related)
+
+
+def collect_relation_terms(
+    synset_ids: Iterable[str],
+    relation_types: Iterable[str],
+    synsets: Dict[str, Dict[str, object]],
+) -> Dict[str, List[str]]:
+    return {
+        relation_type: resolve_related_lemmas(synset_ids, relation_type, synsets)[:20]
+        for relation_type in relation_types
+    }
+
+
+def enrich_entry(
+    entry: Dict[str, object],
+    synsets: Dict[str, Dict[str, object]],
+    entries_by_norm: Dict[str, List[Dict[str, object]]],
+) -> Dict[str, object]:
+    normalized_candidates = dedupe_keep_order(
+        [
+            normalize_word(str(entry.get("form", ""))),
+            normalize_word(str(entry.get("lemma", ""))),
+            normalize_word(str(entry.get("normalized_form", ""))),
+        ]
+    )
+    matches: List[Dict[str, object]] = []
+    for candidate_key in normalized_candidates:
+        matches.extend(entries_by_norm.get(candidate_key, []))
+
+    if not matches:
+        entry["semantic"] = {
+            "source": "iwn-omw",
+            "matched": False,
+            "match_count": 0,
+            "synsets": [],
+            "synonyms": [],
+            "raw_relation_terms": {},
+            "glosses": [],
+            "semantic_topics": [],
+        }
+        return entry
+
+    selected = best_candidate(matches, str(entry.get("pos", "")), entry.get("topics", []), synsets)
+    sorted_senses = sorted(
+        selected.get("senses", []),
+        key=lambda sense: score_sense(str(sense.get("synset_id", "")), entry.get("topics", []), synsets),
+        reverse=True,
+    )
+    synset_ids = [sense["synset_id"] for sense in sorted_senses]
+    synset_payloads = []
+    glosses = []
+    synonyms = []
+    for synset_id in synset_ids[:3]:
+        synset = synsets.get(synset_id)
+        if not synset:
+            continue
+        gloss = str(synset.get("definition", "")).strip()
+        glosses.append(gloss)
+        synset_payloads.append(
+            {
+                "id": synset_id,
+                "pos": synset.get("pos", ""),
+                "definition": gloss,
+                "lemmas": dedupe_keep_order(synset.get("lemmas", []))[:12],
+                "raw_relation_terms": collect_relation_terms(
+                    [synset_id],
+                    ("hypernym", "hyponym", "similar"),
+                    synsets,
+                ),
+            }
+        )
+        synonyms.extend(synset.get("lemmas", []))
+
+    raw_relation_terms = collect_relation_terms(
+        synset_ids,
+        ("hypernym", "hyponym", "similar"),
+        synsets,
+    )
+    synonyms = [
+        lemma
+        for lemma in dedupe_keep_order(synonyms)
+        if normalize_word(lemma) != normalize_word(str(entry.get("form", "")))
+    ][:20]
+    glosses = dedupe_keep_order(glosses)
+    semantic_topics = dedupe_keep_order(
+        list(entry.get("topics", []))
+        + semantic_topics_from_text(
+            glosses
+            + synonyms
+            + raw_relation_terms.get("hypernym", [])
+            + raw_relation_terms.get("hyponym", [])
+            + raw_relation_terms.get("similar", [])
+        )
+    )
+    entry["topics"] = dedupe_keep_order(list(entry.get("topics", [])) + semantic_topics)
+    entry["semantic"] = {
+        "source": "iwn-omw",
+        "matched": True,
+        "match_count": len(matches),
+        "selected_form": selected.get("written_form", ""),
+        "synsets": synset_payloads,
+        "synonyms": synonyms,
+        "raw_relation_terms": raw_relation_terms,
+        "glosses": glosses,
+        "semantic_topics": semantic_topics,
+    }
+    return entry
+
+
+def build_semantic_lexicon() -> Dict[str, object]:
+    if not LEXICON_OUTPUT_PATH.exists():
+        raise FileNotFoundError(f"Lessico di base non trovato: {LEXICON_OUTPUT_PATH}")
+    if not IWN_XML_PATH.exists():
+        raise FileNotFoundError(f"File Open ItalWordNet non trovato: {IWN_XML_PATH}")
+
+    payload = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
+    synsets, entries_by_norm = parse_iwn()
+
+    enriched_entries = []
+    for entry in payload.get("entries", []):
+        enriched_entries.append(enrich_entry(dict(entry), synsets, entries_by_norm))
+
+    return {
+        "meta": {
+            "language": "it",
+            "version": 1,
+            "base_lexicon": str(LEXICON_OUTPUT_PATH.name),
+            "sources": [
+                "lexicon_it.json",
+                "iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml",
+            ],
+            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
+            "entry_count": len(enriched_entries),
+            "semantic_source": "IWN-OMW v1.0",
+        },
+        "entries": enriched_entries,
+    }
+
+
+def main() -> None:
+    payload = build_semantic_lexicon()
+    SEMANTIC_LEXICON_OUTPUT_PATH.write_text(
+        json.dumps(payload, ensure_ascii=False, indent=2),
+        encoding="utf-8",
+    )
+    matched = sum(1 for entry in payload["entries"] if entry.get("semantic", {}).get("matched"))
+    print(f"Lessico semantico generato: {SEMANTIC_LEXICON_OUTPUT_PATH}")
+    print(f"Voci totali: {payload['meta']['entry_count']}")
+    print(f"Voci con match semantico: {matched}")
+
+
+# Build and write the lexicon only when run as a script, not on import.
+if __name__ == "__main__":
+    main()