Files
cruciverba_1/build_semantic_lexicon.py

427 lines
14 KiB
Python

from __future__ import annotations
import json
import re
import unicodedata
import xml.etree.ElementTree as ET
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterable, List, Tuple
from build_lexicon import LEXICON_OUTPUT_PATH, infer_topics
# Input: the Open ItalWordNet LMF XML file, looked up in a directory tree that
# sits next to this module.
IWN_XML_PATH = Path(__file__).parent.joinpath(
    "iwn-omw-main",
    "IWN-OMW-main",
    "data",
    "LMF-XML",
    "IWN-OMW_LMF_v1.0.xml",
)
# Output: the enriched lexicon JSON, written next to this module.
SEMANTIC_LEXICON_OUTPUT_PATH = Path(__file__).parent / "lexicon_it_semantic.json"
# Translates the single-letter part-of-speech codes read from the wordnet XML
# (`partOfSpeech` attribute of each Lemma) into the upper-case tags stored in
# the lexicon entries. Codes missing from this table are handled by the caller.
IWN_POS_MAP = {
    "n": "NOUN",
    "v": "VERB",
    "a": "ADJ",
    "s": "ADJ",  # presumably WordNet's "adjective satellite" code -- TODO confirm
    "r": "ADV",
}
# Topic name -> set of Italian trigger words. A topic is assigned to a piece of
# text when at least one of its trigger words appears among the text's tokens.
# Keywords are written already lower-cased and without accents (e.g. "citta")
# because they are compared against ASCII-folded tokens.
SEMANTIC_TOPIC_KEYWORDS = {
    "animals": {
        "animale", "animali", "mammifero", "mammiferi",
        "uccello", "uccelli", "pesce", "rettile",
        "domestico", "compagnia", "caccia", "pastorizia",
    },
    "plants": {
        "pianta", "piante", "albero", "alberi",
        "fiore", "foglia", "foglie", "frutto",
        "ghianda", "bosco", "radice", "seme", "vegetale",
    },
    "nature": {
        "natura", "naturale", "terra", "acqua", "aria",
        "mare", "montagna", "bosco", "lago", "fiume",
        "vento", "roccia", "suolo", "superficie", "terrestre",
    },
    "ecology": {
        "ecologia", "ambiente", "ambientale", "clima", "energia",
        "naturale", "verde", "ecosistema", "acqua", "terra",
    },
    "geography": {
        "territorio", "superficie", "terrestre", "regione", "confine",
        "montagna", "isola", "mare", "lago", "fiume",
    },
    "weather": {
        "clima", "pioggia", "vento", "nuvola", "nebbia",
        "tempesta", "gelo", "brina", "atmosfera",
    },
    "sea": {
        "mare", "marino", "marina", "acque", "salate",
        "porto", "barca", "vela", "nave", "fondale",
    },
    "mountain": {
        "montagna", "vetta", "collina", "rilievo",
        "alpino", "roccia", "sentiero",
    },
    "health": {
        "salute", "medico", "medicina", "corpo", "sangue",
        "cura", "malattia", "terapia", "cervello", "respiro",
    },
    "science": {
        "scienza", "scientifico", "tecnica", "misura", "energia",
        "materia", "fisica", "chimica", "biologia", "strumento",
    },
    "sport": {
        "sport", "gara", "squadra", "pallone", "atleta",
        "stadio", "rete", "gioco", "agonistico",
    },
    "history": {
        "storia", "storico", "antico", "regno",
        "impero", "senato", "romano", "epoca",
    },
    "school": {
        "scuola", "lezione", "studente", "classe", "maestro",
        "esame", "libro", "quaderno", "aula",
    },
    "cinema": {
        "film", "cinema", "pellicola", "regista",
        "attore", "scena", "spettacolo", "teatro",
    },
    "literature": {
        "libro", "autore", "lettura", "scrittura",
        "racconto", "poesia", "romanzo", "letteratura",
    },
    "food": {
        "cibo", "bevanda", "mangiare", "pane", "frutto",
        "latte", "zucchero", "farina", "gelato",
    },
    "city": {
        "citta", "urbano", "strada", "piazza", "ponte",
        "palazzo", "stazione", "porta", "quartiere",
    },
    "transport": {
        "veicolo", "trasporto", "strada", "motore", "treno",
        "ruota", "barca", "nave", "aereo", "automobile",
    },
    "work": {
        "lavoro", "mestiere", "opera", "progetto",
        "strumento", "tecnica", "servizio",
    },
    "home": {
        "casa", "abitazione", "porta", "finestra", "parete",
        "camera", "balcone", "tavolo", "sedia",
    },
}
def normalize_word(text: str) -> str:
    """Reduce *text* to a bare lookup key made only of the letters a-z.

    The text is decomposed with NFKD so that accented letters split into a
    base letter plus combining marks, everything outside ASCII is dropped,
    the result is lower-cased, and finally underscores and every remaining
    character that is not a-z (spaces, digits, punctuation) are removed.

    Args:
        text: The word or phrase to normalise.

    Returns:
        The normalised key; an empty string when no ASCII letter survives.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # Encoding with "ignore" silently discards combining marks and any other
    # non-ASCII code point left after decomposition.
    ascii_bytes = decomposed.encode("ascii", "ignore")
    lowered = ascii_bytes.decode("ascii").lower()
    return re.sub(r"[^a-z]", "", lowered.replace("_", ""))
def semantic_topics_from_text(parts: Iterable[str]) -> List[str]:
    """Infer topic labels from free text by keyword matching.

    Every part contributes two kinds of tokens:

    * its whole normalised form (``normalize_word(part)``), so that a
      multi-word or underscore-joined lemma can still match as one unit;
    * each individual word of the part, ASCII-folded and lower-cased.

    The individual words are extracted *before* separators are removed.
    Tokenising the output of ``normalize_word`` instead would be useless:
    that function strips spaces and punctuation, so a sentence such as
    "animale domestico" would collapse into the single token
    "animaledomestico" and no keyword inside a definition could ever match.

    Args:
        parts: Text fragments (definitions, lemmas, related terms).

    Returns:
        The alphabetically sorted names of all topics in
        ``SEMANTIC_TOPIC_KEYWORDS`` having at least one keyword among the
        collected tokens.
    """
    tokens = set()
    for part in parts:
        whole_form = normalize_word(part)
        if not whole_form:
            # No ASCII letter in this part, hence no usable word either.
            continue
        tokens.add(whole_form)
        # Same folding as normalize_word, but keeping spaces, apostrophes and
        # other separators in place so the text can be split into real words
        # (e.g. "dell'acqua" -> "dell", "acqua").
        folded = (
            unicodedata.normalize("NFKD", part)
            .encode("ascii", "ignore")
            .decode("ascii")
            .lower()
        )
        tokens.update(re.findall(r"[a-z]+", folded))
    return sorted(
        topic
        for topic, keywords in SEMANTIC_TOPIC_KEYWORDS.items()
        if not keywords.isdisjoint(tokens)
    )
def parse_iwn() -> Tuple[Dict[str, Dict[str, object]], Dict[str, List[Dict[str, object]]]]:
    """Load the wordnet XML at ``IWN_XML_PATH`` into two lookup tables.

    Returns:
        A pair ``(synsets, entries_by_norm)``:

        * ``synsets`` maps a synset id to a record with the keys ``id``,
          ``ili``, ``definition``, ``relations`` (list of dicts with
          ``type``, ``target`` and ``subtype``), ``lemmas`` (written forms
          attached to the synset, without duplicates, in first-seen order)
          and ``pos``.
        * ``entries_by_norm`` maps a normalised written form to the list of
          lexical entries sharing it; each entry carries ``written_form``,
          ``normalized_form``, ``pos`` and ``senses`` (dicts with
          ``sense_id``, ``synset_id`` and ``pos``).
    """
    raw_xml = IWN_XML_PATH.read_text(encoding="utf-8")
    # Every `" -->` sequence is rewritten to `">` before parsing; presumably
    # this patches malformed tag endings in the distributed file -- TODO confirm.
    root = ET.fromstring(raw_xml.replace('" -->', '">'))

    # Pass 1: one record per <Synset>; lemmas and pos are filled in by pass 2.
    synsets: Dict[str, Dict[str, object]] = {}
    for synset_node in root.findall(".//Synset"):
        identifier = synset_node.attrib.get("id", "")
        synsets[identifier] = {
            "id": identifier,
            "ili": synset_node.attrib.get("ili", ""),
            "definition": (synset_node.findtext("Definition") or "").strip(),
            "relations": [
                {
                    "type": relation_node.attrib.get("relType", ""),
                    "target": relation_node.attrib.get("target", ""),
                    "subtype": relation_node.attrib.get(
                        "{https://globalwordnet.github.io/schemas/dc/}type", ""
                    ),
                }
                for relation_node in synset_node.findall("SynsetRelation")
            ],
            "lemmas": [],
            "pos": "",
        }

    # Pass 2: attach every <LexicalEntry> to the synsets its senses point at.
    entries_by_norm: Dict[str, List[Dict[str, object]]] = defaultdict(list)
    for entry_node in root.findall(".//LexicalEntry"):
        lemma_node = entry_node.find("Lemma")
        if lemma_node is None:
            continue
        written_form = lemma_node.attrib.get("writtenForm", "").strip()
        normalized_form = normalize_word(written_form)
        if not normalized_form:
            continue
        pos_code = lemma_node.attrib.get("partOfSpeech", "").strip().lower()
        # Unknown or missing codes fall back to NOUN.
        pos = IWN_POS_MAP.get(pos_code, "NOUN")

        linked_senses = []
        for sense_node in entry_node.findall("Sense"):
            target_id = sense_node.attrib.get("synset", "")
            if target_id and target_id in synsets:
                target_record = synsets[target_id]
                target_record["lemmas"].append(written_form)
                # The synset's pos ends up being that of the last lemma seen.
                target_record["pos"] = pos
                linked_senses.append(
                    {
                        "sense_id": sense_node.attrib.get("id", ""),
                        "synset_id": target_id,
                        "pos": pos,
                    }
                )

        # Entries whose senses all point at unknown synsets are not indexed.
        if linked_senses:
            entries_by_norm[normalized_form].append(
                {
                    "written_form": written_form,
                    "normalized_form": normalized_form,
                    "pos": pos,
                    "senses": linked_senses,
                }
            )

    # Drop repeated lemmas inside each synset, keeping first-seen order.
    for record in synsets.values():
        record["lemmas"] = list(dict.fromkeys(record["lemmas"]))

    return synsets, entries_by_norm
def score_sense(
    synset_id: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> int:
    """Score how well a synset fits the topics already known for an entry.

    Topics are inferred from the synset's definition and lemmas; each topic
    shared with *current_topics* is worth 10 points, and the length of the
    definition (in characters) is added on top.

    Args:
        synset_id: Id of the synset to score; an unknown id is scored as an
            empty synset.
        current_topics: Topics already assigned to the lexicon entry.
        synsets: Synset table as returned by ``parse_iwn``.

    Returns:
        The integer score (higher means a better fit).
    """
    record = synsets.get(synset_id, {})
    gloss = str(record.get("definition", ""))
    detected = set(semantic_topics_from_text([gloss, *record.get("lemmas", [])]))
    wanted = {str(topic) for topic in current_topics}
    return 10 * len(detected & wanted) + len(gloss)
def best_candidate(
    candidates: List[Dict[str, object]],
    expected_pos: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> Dict[str, object]:
    """Pick the lexical entry that best matches an expected part of speech.

    A candidate earns 100 points when its ``pos`` equals *expected_pos*, plus
    the best ``score_sense`` value among its senses (0 when it has none).
    When several candidates tie, the one listed first wins.

    Args:
        candidates: Non-empty list of lexical entries to choose from.
        expected_pos: Part-of-speech tag of the lexicon entry being enriched.
        current_topics: Topics already assigned to that entry.
        synsets: Synset table as returned by ``parse_iwn``.

    Returns:
        The highest-scoring candidate.

    Raises:
        IndexError: If *candidates* is empty.
    """

    def total_score(candidate: Dict[str, object]) -> int:
        sense_scores = [
            score_sense(str(sense.get("synset_id", "")), current_topics, synsets)
            for sense in candidate.get("senses", [])
        ]
        pos_points = 100 if candidate["pos"] == expected_pos else 0
        return pos_points + max([0, *sense_scores])

    scores = [total_score(candidate) for candidate in candidates]
    # sorted() is stable even with reverse=True, so equal scores keep their
    # original relative order and the earliest candidate stays in front.
    by_score = sorted(range(len(candidates)), key=scores.__getitem__, reverse=True)
    return candidates[by_score[0]]
def dedupe_keep_order(items: Iterable[str]) -> List[str]:
    """Return the distinct, non-empty items in first-occurrence order.

    Each item is converted with ``str()`` and stripped of surrounding
    whitespace before being compared; items that end up empty are dropped.

    Args:
        items: Values to deduplicate.

    Returns:
        A new list of the cleaned, unique strings.
    """
    cleaned = (str(item).strip() for item in items)
    # A dict preserves insertion order, so its keys are the unique values in
    # the order they were first seen.
    return list(dict.fromkeys(text for text in cleaned if text))
def resolve_related_lemmas(
    synset_ids: Iterable[str],
    relation_type: str,
    synsets: Dict[str, Dict[str, object]],
) -> List[str]:
    """Collect the lemmas of synsets linked by one kind of relation.

    For every given synset, each relation whose ``type`` equals
    *relation_type* is followed to its target synset and the target's lemmas
    are gathered. Unknown source or target ids are skipped.

    Args:
        synset_ids: Ids of the synsets to start from.
        relation_type: Relation type to follow (compared with each
            relation's ``type`` value).
        synsets: Synset table as returned by ``parse_iwn``.

    Returns:
        The gathered lemmas, deduplicated in first-seen order.
    """
    collected = []
    for source_id in synset_ids:
        source = synsets.get(source_id)
        if not source:
            continue
        linked_targets = (
            synsets.get(relation.get("target", ""))
            for relation in source.get("relations", [])
            if relation.get("type") == relation_type
        )
        for target in linked_targets:
            if target:
                collected.extend(target.get("lemmas", []))
    return dedupe_keep_order(collected)
def collect_relation_terms(
    synset_ids: Iterable[str],
    relation_types: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> Dict[str, List[str]]:
    """Gather related lemmas for several relation types at once.

    Args:
        synset_ids: Ids of the synsets to start from. Any iterable is
            accepted, including a one-shot iterator or generator.
        relation_types: Relation types to resolve.
        synsets: Synset table as returned by ``parse_iwn``.

    Returns:
        A dict mapping each relation type to at most 20 related lemmas, as
        produced by ``resolve_related_lemmas``.
    """
    # The ids are walked once per relation type. Materialise them first so a
    # generator argument is not exhausted after the first relation type,
    # which would silently leave every later type with an empty list.
    source_ids = list(synset_ids)
    return {
        relation_type: resolve_related_lemmas(source_ids, relation_type, synsets)[:20]
        for relation_type in relation_types
    }
def enrich_entry(
    entry: Dict[str, object],
    synsets: Dict[str, Dict[str, object]],
    entries_by_norm: Dict[str, List[Dict[str, object]]],
) -> Dict[str, object]:
    """Attach wordnet-derived semantic data to one lexicon entry.

    The entry's ``form``, ``lemma`` and ``normalized_form`` are normalised
    and looked up in *entries_by_norm*. Without any match the entry only
    receives a ``semantic`` block flagged ``matched: False``. Otherwise the
    best lexical entry is chosen with ``best_candidate``, its senses are
    ranked with ``score_sense``, and the entry receives a ``semantic`` block
    with details for the three best synsets, synonyms, related terms, glosses
    and topics; the entry's ``topics`` are extended as well.

    Args:
        entry: Lexicon entry to enrich. It is modified in place.
        synsets: Synset table as returned by ``parse_iwn``.
        entries_by_norm: Lexical-entry index as returned by ``parse_iwn``.

    Returns:
        The same *entry* object, after modification.
    """
    relation_kinds = ("hypernym", "hyponym", "similar")
    form_key = normalize_word(str(entry.get("form", "")))
    lookup_keys = dedupe_keep_order(
        [
            form_key,
            normalize_word(str(entry.get("lemma", ""))),
            normalize_word(str(entry.get("normalized_form", ""))),
        ]
    )
    matches: List[Dict[str, object]] = [
        hit for key in lookup_keys for hit in entries_by_norm.get(key, [])
    ]

    if not matches:
        entry["semantic"] = {
            "source": "iwn-omw",
            "matched": False,
            "match_count": 0,
            "synsets": [],
            "synonyms": [],
            "raw_relation_terms": {},
            "glosses": [],
            "semantic_topics": [],
        }
        return entry

    base_topics = entry.get("topics", [])

    def sense_rank(sense: Dict[str, object]) -> int:
        return score_sense(str(sense.get("synset_id", "")), base_topics, synsets)

    selected = best_candidate(matches, str(entry.get("pos", "")), base_topics, synsets)
    ranked_senses = sorted(selected.get("senses", []), key=sense_rank, reverse=True)
    synset_ids = [sense["synset_id"] for sense in ranked_senses]

    # Detailed payloads, glosses and synonyms come from the three best-ranked
    # synsets only.
    synset_payloads = []
    gloss_pool = []
    lemma_pool = []
    for synset_id in synset_ids[:3]:
        record = synsets.get(synset_id)
        if not record:
            continue
        definition = str(record.get("definition", "")).strip()
        gloss_pool.append(definition)
        lemma_pool.extend(record.get("lemmas", []))
        synset_payloads.append(
            {
                "id": synset_id,
                "pos": record.get("pos", ""),
                "definition": definition,
                "lemmas": dedupe_keep_order(record.get("lemmas", []))[:12],
                "raw_relation_terms": collect_relation_terms(
                    [synset_id], relation_kinds, synsets
                ),
            }
        )

    # The entry-level relation terms, by contrast, span every ranked synset.
    raw_relation_terms = collect_relation_terms(synset_ids, relation_kinds, synsets)

    # A lemma that normalises to the entry's own form is not a synonym of it.
    synonyms = [
        lemma
        for lemma in dedupe_keep_order(lemma_pool)
        if normalize_word(lemma) != form_key
    ][:20]
    glosses = dedupe_keep_order(gloss_pool)

    topic_text = glosses + synonyms
    for kind in relation_kinds:
        topic_text += raw_relation_terms.get(kind, [])
    semantic_topics = dedupe_keep_order(
        list(base_topics) + semantic_topics_from_text(topic_text)
    )

    entry["topics"] = dedupe_keep_order(list(base_topics) + semantic_topics)
    entry["semantic"] = {
        "source": "iwn-omw",
        "matched": True,
        "match_count": len(matches),
        "selected_form": selected.get("written_form", ""),
        "synsets": synset_payloads,
        "synonyms": synonyms,
        "raw_relation_terms": raw_relation_terms,
        "glosses": glosses,
        "semantic_topics": semantic_topics,
    }
    return entry
def build_semantic_lexicon() -> Dict[str, object]:
    """Build the semantically enriched lexicon payload.

    Reads the base lexicon JSON at ``LEXICON_OUTPUT_PATH``, parses the
    wordnet XML with ``parse_iwn`` and enriches a shallow copy of every base
    entry with ``enrich_entry``.

    Returns:
        A dict with a ``meta`` section (language, version, sources,
        generation timestamp, entry count) and the ``entries`` list.

    Raises:
        FileNotFoundError: If the base lexicon or the wordnet XML file does
            not exist.
    """
    required_inputs = (
        (LEXICON_OUTPUT_PATH, f"Lessico di base non trovato: {LEXICON_OUTPUT_PATH}"),
        (IWN_XML_PATH, f"File Open ItalWordNet non trovato: {IWN_XML_PATH}"),
    )
    for input_path, missing_message in required_inputs:
        if not input_path.exists():
            raise FileNotFoundError(missing_message)

    base_payload = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
    synsets, entries_by_norm = parse_iwn()
    # dict(...) makes a shallow copy so the parsed base entries are not the
    # objects that enrich_entry modifies.
    enriched_entries = [
        enrich_entry(dict(base_entry), synsets, entries_by_norm)
        for base_entry in base_payload.get("entries", [])
    ]

    meta = {
        "language": "it",
        "version": 1,
        "base_lexicon": str(LEXICON_OUTPUT_PATH.name),
        "sources": [
            "lexicon_it.json",
            "iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml",
        ],
        # Local time with its UTC offset, truncated to whole seconds.
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(enriched_entries),
        "semantic_source": "IWN-OMW v1.0",
    }
    return {"meta": meta, "entries": enriched_entries}
def main() -> None:
    """Generate the semantic lexicon, save it as JSON and print a summary.

    The payload from ``build_semantic_lexicon`` is written to
    ``SEMANTIC_LEXICON_OUTPUT_PATH`` as UTF-8, indented JSON with non-ASCII
    characters kept as-is; then the output path, the total number of entries
    and the number of entries with a semantic match are printed.
    """
    payload = build_semantic_lexicon()
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    SEMANTIC_LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    matched = len(
        [
            entry
            for entry in payload["entries"]
            if entry.get("semantic", {}).get("matched")
        ]
    )
    print(f"Lessico semantico generato: {SEMANTIC_LEXICON_OUTPUT_PATH}")
    print(f"Voci totali: {payload['meta']['entry_count']}")
    print(f"Voci con match semantico: {matched}")


# Run the generator only when the module is executed as a script.
if __name__ == "__main__":
    main()