from __future__ import annotations import json import re import unicodedata import xml.etree.ElementTree as ET from collections import defaultdict from datetime import datetime from pathlib import Path from typing import Dict, Iterable, List, Tuple from build_lexicon import LEXICON_OUTPUT_PATH, infer_topics IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml" SEMANTIC_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_semantic.json") IWN_POS_MAP = { "n": "NOUN", "v": "VERB", "a": "ADJ", "s": "ADJ", "r": "ADV", } SEMANTIC_TOPIC_KEYWORDS = { "animals": { "animale", "animali", "mammifero", "mammiferi", "uccello", "uccelli", "pesce", "rettile", "domestico", "compagnia", "caccia", "pastorizia", }, "plants": { "pianta", "piante", "albero", "alberi", "fiore", "foglia", "foglie", "frutto", "ghianda", "bosco", "radice", "seme", "vegetale", }, "nature": { "natura", "naturale", "terra", "acqua", "aria", "mare", "montagna", "bosco", "lago", "fiume", "vento", "roccia", "suolo", "superficie", "terrestre", }, "ecology": { "ecologia", "ambiente", "ambientale", "clima", "energia", "naturale", "verde", "ecosistema", "acqua", "terra", }, "geography": { "territorio", "superficie", "terrestre", "regione", "confine", "montagna", "isola", "mare", "lago", "fiume", }, "weather": { "clima", "pioggia", "vento", "nuvola", "nebbia", "tempesta", "gelo", "brina", "atmosfera", }, "sea": { "mare", "marino", "marina", "acque", "salate", "porto", "barca", "vela", "nave", "fondale", }, "mountain": { "montagna", "vetta", "collina", "rilievo", "alpino", "roccia", "sentiero", }, "health": { "salute", "medico", "medicina", "corpo", "sangue", "cura", "malattia", "terapia", "cervello", "respiro", }, "science": { "scienza", "scientifico", "tecnica", "misura", "energia", "materia", "fisica", "chimica", "biologia", "strumento", }, "sport": { "sport", "gara", "squadra", "pallone", "atleta", "stadio", "rete", "gioco", "agonistico", }, "history": { "storia", "storico", "antico", "regno", "impero", "senato", "romano", "epoca", }, "school": { "scuola", "lezione", "studente", "classe", "maestro", "esame", "libro", "quaderno", "aula", }, "cinema": { "film", "cinema", "pellicola", "regista", "attore", "scena", "spettacolo", "teatro", }, "literature": { "libro", "autore", "lettura", "scrittura", "racconto", "poesia", "romanzo", "letteratura", }, "food": { "cibo", "bevanda", "mangiare", "pane", "frutto", "latte", "zucchero", "farina", "gelato", }, "city": { "citta", "urbano", "strada", "piazza", "ponte", "palazzo", "stazione", "porta", "quartiere", }, "transport": { "veicolo", "trasporto", "strada", "motore", "treno", "ruota", "barca", "nave", "aereo", "automobile", }, "work": { "lavoro", "mestiere", "opera", "progetto", "strumento", "tecnica", "servizio", }, "home": { "casa", "abitazione", "porta", "finestra", "parete", "camera", "balcone", "tavolo", "sedia", }, } def normalize_word(text: str) -> str: normalized = unicodedata.normalize("NFKD", text) ascii_only = normalized.encode("ascii", "ignore").decode("ascii") ascii_only = ascii_only.lower().replace("_", "") ascii_only = re.sub(r"[^a-z]", "", ascii_only) return ascii_only def semantic_topics_from_text(parts: Iterable[str]) -> List[str]: tokens = set() for part in parts: normalized = normalize_word(part) if not normalized: continue tokens.add(normalized) tokens.update(filter(None, re.findall(r"[a-z]+", normalize_word(part)))) topics = set() for topic, keywords in SEMANTIC_TOPIC_KEYWORDS.items(): if any(keyword in tokens for keyword in keywords): topics.add(topic) return sorted(topics) def parse_iwn() -> Tuple[Dict[str, Dict[str, object]], Dict[str, List[Dict[str, object]]]]: xml_text = IWN_XML_PATH.read_text(encoding="utf-8") xml_text = xml_text.replace('" -->', '">') root = ET.fromstring(xml_text) synsets: Dict[str, Dict[str, object]] = {} entries_by_norm: Dict[str, List[Dict[str, object]]] = defaultdict(list) for synset in root.findall(".//Synset"): synset_id = synset.attrib.get("id", "") relations = [ { "type": relation.attrib.get("relType", ""), "target": relation.attrib.get("target", ""), "subtype": relation.attrib.get("{https://globalwordnet.github.io/schemas/dc/}type", ""), } for relation in synset.findall("SynsetRelation") ] synsets[synset_id] = { "id": synset_id, "ili": synset.attrib.get("ili", ""), "definition": (synset.findtext("Definition") or "").strip(), "relations": relations, "lemmas": [], "pos": "", } for lexical_entry in root.findall(".//LexicalEntry"): lemma = lexical_entry.find("Lemma") if lemma is None: continue written_form = lemma.attrib.get("writtenForm", "").strip() normalized_form = normalize_word(written_form) if not normalized_form: continue pos = IWN_POS_MAP.get(lemma.attrib.get("partOfSpeech", "").strip().lower(), "NOUN") senses = lexical_entry.findall("Sense") sense_payloads = [] for sense in senses: synset_id = sense.attrib.get("synset", "") sense_id = sense.attrib.get("id", "") if not synset_id or synset_id not in synsets: continue synsets[synset_id]["lemmas"].append(written_form) synsets[synset_id]["pos"] = pos sense_payloads.append( { "sense_id": sense_id, "synset_id": synset_id, "pos": pos, } ) if sense_payloads: entries_by_norm[normalized_form].append( { "written_form": written_form, "normalized_form": normalized_form, "pos": pos, "senses": sense_payloads, } ) for synset in synsets.values(): unique_lemmas = [] seen = set() for lemma in synset["lemmas"]: if lemma not in seen: seen.add(lemma) unique_lemmas.append(lemma) synset["lemmas"] = unique_lemmas return synsets, entries_by_norm def score_sense( synset_id: str, current_topics: Iterable[str], synsets: Dict[str, Dict[str, object]], ) -> int: synset = synsets.get(synset_id, {}) definition = str(synset.get("definition", "")) inferred_topics = set(semantic_topics_from_text([definition] + list(synset.get("lemmas", [])))) current_topics_set = set(str(topic) for topic in current_topics) overlap = len(inferred_topics & current_topics_set) return overlap * 10 + len(definition) def best_candidate( candidates: List[Dict[str, object]], expected_pos: str, current_topics: Iterable[str], synsets: Dict[str, Dict[str, object]], ) -> Dict[str, object]: ranked = [] for candidate in candidates: pos_bonus = 100 if candidate["pos"] == expected_pos else 0 sense_bonus = 0 for sense in candidate.get("senses", []): sense_bonus = max( sense_bonus, score_sense(str(sense.get("synset_id", "")), current_topics, synsets), ) ranked.append((pos_bonus + sense_bonus, candidate)) ranked.sort(key=lambda item: item[0], reverse=True) return ranked[0][1] def dedupe_keep_order(items: Iterable[str]) -> List[str]: seen = set() result = [] for item in items: text = str(item).strip() if not text or text in seen: continue seen.add(text) result.append(text) return result def resolve_related_lemmas( synset_ids: Iterable[str], relation_type: str, synsets: Dict[str, Dict[str, object]], ) -> List[str]: related = [] for synset_id in synset_ids: synset = synsets.get(synset_id) if not synset: continue for relation in synset.get("relations", []): if relation.get("type") != relation_type: continue target = relation.get("target", "") target_synset = synsets.get(target) if not target_synset: continue related.extend(target_synset.get("lemmas", [])) return dedupe_keep_order(related) def collect_relation_terms( synset_ids: Iterable[str], relation_types: Iterable[str], synsets: Dict[str, Dict[str, object]], ) -> Dict[str, List[str]]: return { relation_type: resolve_related_lemmas(synset_ids, relation_type, synsets)[:20] for relation_type in relation_types } def enrich_entry( entry: Dict[str, object], synsets: Dict[str, Dict[str, object]], entries_by_norm: Dict[str, List[Dict[str, object]]], ) -> Dict[str, object]: normalized_candidates = dedupe_keep_order( [ normalize_word(str(entry.get("form", ""))), normalize_word(str(entry.get("lemma", ""))), normalize_word(str(entry.get("normalized_form", ""))), ] ) matches: List[Dict[str, object]] = [] for candidate_key in normalized_candidates: matches.extend(entries_by_norm.get(candidate_key, [])) if not matches: entry["semantic"] = { "source": "iwn-omw", "matched": False, "match_count": 0, "synsets": [], "synonyms": [], "raw_relation_terms": {}, "glosses": [], "semantic_topics": [], } return entry selected = best_candidate(matches, str(entry.get("pos", "")), entry.get("topics", []), synsets) sorted_senses = sorted( selected.get("senses", []), key=lambda sense: score_sense(str(sense.get("synset_id", "")), entry.get("topics", []), synsets), reverse=True, ) synset_ids = [sense["synset_id"] for sense in sorted_senses] synset_payloads = [] glosses = [] synonyms = [] for synset_id in synset_ids[:3]: synset = synsets.get(synset_id) if not synset: continue gloss = str(synset.get("definition", "")).strip() glosses.append(gloss) synset_payloads.append( { "id": synset_id, "pos": synset.get("pos", ""), "definition": gloss, "lemmas": dedupe_keep_order(synset.get("lemmas", []))[:12], "raw_relation_terms": collect_relation_terms( [synset_id], ("hypernym", "hyponym", "similar"), synsets, ), } ) synonyms.extend(synset.get("lemmas", [])) raw_relation_terms = collect_relation_terms( synset_ids, ("hypernym", "hyponym", "similar"), synsets, ) synonyms = [ lemma for lemma in dedupe_keep_order(synonyms) if normalize_word(lemma) != normalize_word(str(entry.get("form", ""))) ][:20] glosses = dedupe_keep_order(glosses) semantic_topics = dedupe_keep_order( list(entry.get("topics", [])) + semantic_topics_from_text( glosses + synonyms + raw_relation_terms.get("hypernym", []) + raw_relation_terms.get("hyponym", []) + raw_relation_terms.get("similar", []) ) ) entry["topics"] = dedupe_keep_order(list(entry.get("topics", [])) + semantic_topics) entry["semantic"] = { "source": "iwn-omw", "matched": True, "match_count": len(matches), "selected_form": selected.get("written_form", ""), "synsets": synset_payloads, "synonyms": synonyms, "raw_relation_terms": raw_relation_terms, "glosses": glosses, "semantic_topics": semantic_topics, } return entry def build_semantic_lexicon() -> Dict[str, object]: if not LEXICON_OUTPUT_PATH.exists(): raise FileNotFoundError(f"Lessico di base non trovato: {LEXICON_OUTPUT_PATH}") if not IWN_XML_PATH.exists(): raise FileNotFoundError(f"File Open ItalWordNet non trovato: {IWN_XML_PATH}") payload = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8")) synsets, entries_by_norm = parse_iwn() enriched_entries = [] for entry in payload.get("entries", []): enriched_entries.append(enrich_entry(dict(entry), synsets, entries_by_norm)) return { "meta": { "language": "it", "version": 1, "base_lexicon": str(LEXICON_OUTPUT_PATH.name), "sources": [ "lexicon_it.json", "iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml", ], "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"), "entry_count": len(enriched_entries), "semantic_source": "IWN-OMW v1.0", }, "entries": enriched_entries, } def main() -> None: payload = build_semantic_lexicon() SEMANTIC_LEXICON_OUTPUT_PATH.write_text( json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8", ) matched = sum(1 for entry in payload["entries"] if entry.get("semantic", {}).get("matched")) print(f"Lessico semantico generato: {SEMANTIC_LEXICON_OUTPUT_PATH}") print(f"Voci totali: {payload['meta']['entry_count']}") print(f"Voci con match semantico: {matched}") if __name__ == "__main__": main()