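"""Build a semantically enriched Italian lexicon.

Reads the base lexicon located at ``build_lexicon.LEXICON_OUTPUT_PATH`` and the
IWN-OMW LMF XML dump, attaches synsets, synonyms, relation terms, glosses and
inferred topics to each entry, and writes the result to
``lexicon_it_semantic.json`` next to this module.
"""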
from __future__ import annotations
|
|
|
|
import json
|
|
import re
|
|
import unicodedata
|
|
import xml.etree.ElementTree as ET
|
|
from collections import defaultdict
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Dict, Iterable, List, Tuple
|
|
|
|
from build_lexicon import LEXICON_OUTPUT_PATH
|
|
|
|
|
|
# Location of the IWN-OMW LMF XML dump, resolved relative to the directory
# that contains this module.
IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main").joinpath(
    "IWN-OMW-main",
    "data",
    "LMF-XML",
    "IWN-OMW_LMF_v1.0.xml",
)

# Destination of the enriched lexicon, written next to this module.
SEMANTIC_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_semantic.json")
|
|
|
|
# Translates the lower-cased `partOfSpeech` attribute of an IWN-OMW lemma into
# the upper-case tag that is compared against a lexicon entry's `pos`.
# Both "a" and "s" map to ADJ.
IWN_POS_MAP: Dict[str, str] = {
    "n": "NOUN",
    "v": "VERB",
    "a": "ADJ",
    "s": "ADJ",
    "r": "ADV",
}
|
|
|
|
# Topic name -> accent-free, lower-case Italian keywords. A topic is inferred
# for a piece of text as soon as one of its keywords appears among the text's
# tokens. The same keyword may belong to several topics.
SEMANTIC_TOPIC_KEYWORDS: Dict[str, Set[str]] = {
    "animals": {
        "animale", "animali", "mammifero", "mammiferi", "uccello", "uccelli",
        "pesce", "rettile", "domestico", "compagnia", "caccia", "pastorizia",
    },
    "plants": {
        "pianta", "piante", "albero", "alberi", "fiore", "foglia", "foglie",
        "frutto", "ghianda", "bosco", "radice", "seme", "vegetale",
    },
    "nature": {
        "natura", "naturale", "terra", "acqua", "aria", "mare", "montagna",
        "bosco", "lago", "fiume", "vento", "roccia", "suolo", "superficie",
        "terrestre",
    },
    "ecology": {
        "ecologia", "ambiente", "ambientale", "clima", "energia", "naturale",
        "verde", "ecosistema", "acqua", "terra",
    },
    "geography": {
        "territorio", "superficie", "terrestre", "regione", "confine",
        "montagna", "isola", "mare", "lago", "fiume",
    },
    "weather": {
        "clima", "pioggia", "vento", "nuvola", "nebbia", "tempesta", "gelo",
        "brina", "atmosfera",
    },
    "sea": {
        "mare", "marino", "marina", "acque", "salate", "porto", "barca",
        "vela", "nave", "fondale",
    },
    "mountain": {
        "montagna", "vetta", "collina", "rilievo", "alpino", "roccia",
        "sentiero",
    },
    "health": {
        "salute", "medico", "medicina", "corpo", "sangue", "cura", "malattia",
        "terapia", "cervello", "respiro",
    },
    "science": {
        "scienza", "scientifico", "tecnica", "misura", "energia", "materia",
        "fisica", "chimica", "biologia", "strumento",
    },
    "sport": {
        "sport", "gara", "squadra", "pallone", "atleta", "stadio", "rete",
        "gioco", "agonistico",
    },
    "history": {
        "storia", "storico", "antico", "regno", "impero", "senato", "romano",
        "epoca",
    },
    "school": {
        "scuola", "lezione", "studente", "classe", "maestro", "esame",
        "libro", "quaderno", "aula",
    },
    "cinema": {
        "film", "cinema", "pellicola", "regista", "attore", "scena",
        "spettacolo", "teatro",
    },
    "literature": {
        "libro", "autore", "lettura", "scrittura", "racconto", "poesia",
        "romanzo", "letteratura",
    },
    "food": {
        "cibo", "bevanda", "mangiare", "pane", "frutto", "latte", "zucchero",
        "farina", "gelato",
    },
    "city": {
        "citta", "urbano", "strada", "piazza", "ponte", "palazzo",
        "stazione", "porta", "quartiere",
    },
    "transport": {
        "veicolo", "trasporto", "strada", "motore", "treno", "ruota",
        "barca", "nave", "aereo", "automobile",
    },
    "work": {
        "lavoro", "mestiere", "opera", "progetto", "strumento", "tecnica",
        "servizio",
    },
    "home": {
        "casa", "abitazione", "porta", "finestra", "parete", "camera",
        "balcone", "tavolo", "sedia",
    },
}
|
|
|
|
|
|
def normalize_word(text: str) -> str:
    """Reduce *text* to a lookup key made only of the letters ``a``-``z``.

    The text is NFKD-decomposed, characters that cannot be encoded as ASCII
    (such as the combining accents left by the decomposition) are discarded,
    the remainder is lower-cased, and everything that is not a letter is
    removed. Spaces and punctuation are removed too, so a multi-word input
    collapses into a single run of letters.

    Returns:
        The normalised key; an empty string when no ASCII letter survives.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    folded = decomposed.encode("ascii", "ignore").decode("ascii").lower()
    # Underscores go first, then every remaining character outside a-z.
    return re.sub(r"[^a-z]", "", folded.replace("_", ""))
|
|
|
|
|
|
def semantic_topics_from_text(parts: Iterable[str]) -> List[str]:
    """Infer topic names from free text by keyword matching.

    Each part contributes two kinds of tokens:

    * its fully collapsed key from ``normalize_word`` (all letters joined), and
    * every individual word of the part, accent-folded and lower-cased.

    The individual words used to be extracted from the already collapsed key,
    which contains no separators, so a multi-word definition produced a single
    long token and never matched any keyword. The words are now taken from the
    folded text *before* separators are removed.

    Args:
        parts: Text fragments such as definitions or lemmas.

    Returns:
        The alphabetically sorted names of every topic in
        ``SEMANTIC_TOPIC_KEYWORDS`` that has at least one keyword among the
        collected tokens.
    """
    tokens = set()
    for part in parts:
        collapsed = normalize_word(part)
        if not collapsed:
            # No ASCII letter in this part, so there is nothing to match on.
            continue
        tokens.add(collapsed)

        # Same accent folding as normalize_word, but word boundaries (spaces,
        # punctuation, underscores, digits) are kept so the text can be split.
        folded = (
            unicodedata.normalize("NFKD", part)
            .encode("ascii", "ignore")
            .decode("ascii")
            .lower()
        )
        tokens.update(re.findall(r"[a-z]+", folded))

    return sorted(
        topic
        for topic, keywords in SEMANTIC_TOPIC_KEYWORDS.items()
        if not tokens.isdisjoint(keywords)
    )
|
|
|
|
|
|
def parse_iwn() -> Tuple[Dict[str, Dict[str, object]], Dict[str, List[Dict[str, object]]]]:
    """Parse the IWN-OMW XML file into synset and lexical-entry indexes.

    Returns:
        A pair ``(synsets, entries_by_norm)``.

        ``synsets`` maps a synset id to a dict with the keys ``id``, ``ili``,
        ``definition``, ``relations`` (dicts with ``type``, ``target`` and
        ``subtype``), ``lemmas`` (written forms without duplicates, in
        first-seen order) and ``pos`` (the tag of the last lexical entry that
        referenced the synset, or ``""`` when none did).

        ``entries_by_norm`` maps the ``normalize_word`` key of a written form
        to the lexical entries sharing that key; each entry carries
        ``written_form``, ``normalized_form``, ``pos`` and ``senses``.
        Entries without any sense pointing at a known synset are left out.
    """
    raw_xml = IWN_XML_PATH.read_text(encoding="utf-8")
    # Textual patch applied before parsing: every '" -->' becomes '">'.
    # NOTE(review): presumably repairs malformed tags in the upstream dump — confirm.
    root = ET.fromstring(raw_xml.replace('" -->', '">'))

    dc_type_attribute = "{https://globalwordnet.github.io/schemas/dc/}type"

    synsets: Dict[str, Dict[str, object]] = {}
    for synset_node in root.findall(".//Synset"):
        attributes = synset_node.attrib
        synset_id = attributes.get("id", "")
        synsets[synset_id] = {
            "id": synset_id,
            "ili": attributes.get("ili", ""),
            "definition": (synset_node.findtext("Definition") or "").strip(),
            "relations": [
                {
                    "type": relation_node.attrib.get("relType", ""),
                    "target": relation_node.attrib.get("target", ""),
                    "subtype": relation_node.attrib.get(dc_type_attribute, ""),
                }
                for relation_node in synset_node.findall("SynsetRelation")
            ],
            "lemmas": [],
            "pos": "",
        }

    entries_by_norm: Dict[str, List[Dict[str, object]]] = defaultdict(list)
    for entry_node in root.findall(".//LexicalEntry"):
        lemma_node = entry_node.find("Lemma")
        if lemma_node is None:
            continue

        written_form = lemma_node.attrib.get("writtenForm", "").strip()
        normalized_form = normalize_word(written_form)
        if not normalized_form:
            continue

        # Unknown or missing part-of-speech codes fall back to NOUN.
        pos_code = lemma_node.attrib.get("partOfSpeech", "").strip().lower()
        pos = IWN_POS_MAP.get(pos_code, "NOUN")

        sense_payloads = []
        for sense_node in entry_node.findall("Sense"):
            synset_id = sense_node.attrib.get("synset", "")
            if not (synset_id and synset_id in synsets):
                continue

            # Register this entry on the synset it points at.
            target_synset = synsets[synset_id]
            target_synset["lemmas"].append(written_form)
            target_synset["pos"] = pos
            sense_payloads.append(
                {
                    "sense_id": sense_node.attrib.get("id", ""),
                    "synset_id": synset_id,
                    "pos": pos,
                }
            )

        if sense_payloads:
            entries_by_norm[normalized_form].append(
                {
                    "written_form": written_form,
                    "normalized_form": normalized_form,
                    "pos": pos,
                    "senses": sense_payloads,
                }
            )

    # Several entries can share a written form; keep the first occurrence only.
    for record in synsets.values():
        record["lemmas"] = list(dict.fromkeys(record["lemmas"]))

    return synsets, entries_by_norm
|
|
|
|
|
|
def score_sense(
    synset_id: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> int:
    """Score how well a synset fits the topics already attached to an entry.

    Topics are inferred from the synset's definition and lemmas; each topic
    shared with *current_topics* is worth 10 points, and the length of the
    definition in characters is added on top. An unknown *synset_id* scores 0.

    NOTE(review): a long definition can outweigh several topic overlaps —
    confirm this weighting is intended.
    """
    record = synsets.get(synset_id, {})
    definition = str(record.get("definition", ""))
    text_parts = [definition, *record.get("lemmas", [])]
    inferred = set(semantic_topics_from_text(text_parts))
    wanted = {str(topic) for topic in current_topics}
    return 10 * len(inferred & wanted) + len(definition)
|
|
|
|
|
|
def best_candidate(
    candidates: List[Dict[str, object]],
    expected_pos: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> Dict[str, object]:
    """Pick the candidate entry that best fits the expected POS and topics.

    A candidate earns 100 points when its ``pos`` equals *expected_pos*, plus
    the highest ``score_sense`` value among its senses (0 when it has none).
    On a tie the candidate that comes first in *candidates* wins.

    Raises:
        IndexError: If *candidates* is empty.
    """

    def total_score(candidate: Dict[str, object]) -> int:
        pos_bonus = 100 if candidate["pos"] == expected_pos else 0
        sense_scores = [
            score_sense(str(sense.get("synset_id", "")), current_topics, synsets)
            for sense in candidate.get("senses", [])
        ]
        return pos_bonus + max([0, *sense_scores])

    scored = [(total_score(candidate), candidate) for candidate in candidates]
    # The sort is stable, so equal scores keep their input order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored[0][1]
|
|
|
|
|
|
def dedupe_keep_order(items: Iterable[str]) -> List[str]:
    """Return the distinct, non-empty items in first-seen order.

    Every item is converted with ``str`` and stripped of surrounding
    whitespace before comparison; items that end up empty are dropped.
    """
    stripped = (str(item).strip() for item in items)
    # dict keys keep insertion order, which gives an ordered de-duplication.
    return list(dict.fromkeys(text for text in stripped if text))
|
|
|
|
|
|
def resolve_related_lemmas(
    synset_ids: Iterable[str],
    relation_type: str,
    synsets: Dict[str, Dict[str, object]],
) -> List[str]:
    """Collect the lemmas of synsets linked by one relation type.

    For each id in *synset_ids*, follows every relation whose ``type`` equals
    *relation_type* and gathers the target synset's lemmas. Unknown source or
    target ids are skipped.

    Returns:
        The gathered lemmas, de-duplicated in first-seen order.
    """
    collected: List[str] = []
    for source_id in synset_ids:
        source = synsets.get(source_id)
        if not source:
            continue
        linked_synsets = (
            synsets.get(relation.get("target", ""))
            for relation in source.get("relations", [])
            if relation.get("type") == relation_type
        )
        for linked in linked_synsets:
            if linked:
                collected.extend(linked.get("lemmas", []))
    return dedupe_keep_order(collected)
|
|
|
|
|
|
def collect_relation_terms(
    synset_ids: Iterable[str],
    relation_types: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
    limit: int = 20,
) -> Dict[str, List[str]]:
    """Gather related lemmas for several relation types at once.

    Args:
        synset_ids: Ids of the synsets whose relations are followed.
        relation_types: Relation type names to collect, e.g. ``"hypernym"``.
        synsets: Synset index keyed by synset id.
        limit: Maximum number of lemmas kept per relation type (default 20,
            the value that was previously hard-coded).

    Returns:
        A dict mapping each relation type to at most *limit* lemmas, as
        produced by ``resolve_related_lemmas``.
    """
    # synset_ids is walked once per relation type; a one-shot iterator would be
    # exhausted after the first type, so it is materialised up front.
    source_ids = list(synset_ids)
    return {
        relation_type: resolve_related_lemmas(source_ids, relation_type, synsets)[:limit]
        for relation_type in relation_types
    }
|
|
|
|
|
|
def enrich_entry(
    entry: Dict[str, object],
    synsets: Dict[str, Dict[str, object]],
    entries_by_norm: Dict[str, List[Dict[str, object]]],
) -> Dict[str, object]:
    """Attach a ``semantic`` payload to a lexicon entry and return the entry.

    The entry's ``form``, ``lemma`` and ``normalized_form`` are normalised and
    looked up in *entries_by_norm*. When nothing matches, an unmatched payload
    with empty collections is stored. Otherwise the best candidate is chosen
    with ``best_candidate``, its senses are ranked with ``score_sense``, and
    the payload is built from the three best-ranked synsets (relation terms
    are collected over all of the candidate's synsets).

    Args:
        entry: The lexicon entry; it is modified in place.
        synsets: Synset index keyed by synset id.
        entries_by_norm: Lexical entries keyed by normalised written form.

    Returns:
        The same *entry* object with its ``semantic`` key set.
    """
    # Computed once: used as a lookup key and to filter the form itself out of
    # the synonym list.
    form_key = normalize_word(str(entry.get("form", "")))
    lookup_keys = dedupe_keep_order(
        [
            form_key,
            normalize_word(str(entry.get("lemma", ""))),
            normalize_word(str(entry.get("normalized_form", ""))),
        ]
    )
    matches: List[Dict[str, object]] = []
    for lookup_key in lookup_keys:
        matches.extend(entries_by_norm.get(lookup_key, []))

    if not matches:
        entry["semantic"] = {
            "source": "iwn-omw",
            "matched": False,
            "match_count": 0,
            "synsets": [],
            "synonyms": [],
            "raw_relation_terms": {},
            "glosses": [],
            "semantic_topics": [],
        }
        return entry

    # A `topics` key that is present but null must not reach score_sense,
    # which iterates over the topics.
    topics = entry.get("topics") or []
    relation_types = ("hypernym", "hyponym", "similar")

    selected = best_candidate(matches, str(entry.get("pos", "")), topics, synsets)
    sorted_senses = sorted(
        selected.get("senses", []),
        key=lambda sense: score_sense(str(sense.get("synset_id", "")), topics, synsets),
        reverse=True,
    )
    synset_ids = [sense["synset_id"] for sense in sorted_senses]

    synset_payloads = []
    glosses = []
    synonyms = []
    # Only the three best-ranked synsets are described in detail.
    for synset_id in synset_ids[:3]:
        synset = synsets.get(synset_id)
        if not synset:
            continue
        gloss = str(synset.get("definition", "")).strip()
        glosses.append(gloss)
        synset_payloads.append(
            {
                "id": synset_id,
                "pos": synset.get("pos", ""),
                "definition": gloss,
                "lemmas": dedupe_keep_order(synset.get("lemmas", []))[:12],
                "raw_relation_terms": collect_relation_terms(
                    [synset_id],
                    relation_types,
                    synsets,
                ),
            }
        )
        synonyms.extend(synset.get("lemmas", []))

    raw_relation_terms = collect_relation_terms(synset_ids, relation_types, synsets)
    # The entry's own form is not a synonym of itself.
    synonyms = [
        lemma
        for lemma in dedupe_keep_order(synonyms)
        if normalize_word(lemma) != form_key
    ][:20]
    glosses = dedupe_keep_order(glosses)
    semantic_topics = dedupe_keep_order(
        semantic_topics_from_text(
            glosses
            + synonyms
            + raw_relation_terms.get("hypernym", [])
            + raw_relation_terms.get("hyponym", [])
            + raw_relation_terms.get("similar", [])
        )
    )

    entry["semantic"] = {
        "source": "iwn-omw",
        "matched": True,
        "match_count": len(matches),
        "selected_form": selected.get("written_form", ""),
        "synsets": synset_payloads,
        "synonyms": synonyms,
        "raw_relation_terms": raw_relation_terms,
        "glosses": glosses,
        "semantic_topics": semantic_topics,
    }
    return entry
|
|
|
|
|
|
def build_semantic_lexicon() -> Dict[str, object]:
    """Load the base lexicon, enrich every entry and return the new payload.

    Returns:
        A dict with ``meta`` (language, version, sources, generation
        timestamp, entry count, semantic source) and ``entries`` (the enriched
        copies of the base lexicon's entries).

    Raises:
        FileNotFoundError: If the base lexicon or the IWN-OMW XML file is
            missing.
    """
    # Both inputs are checked before any file is read.
    if not LEXICON_OUTPUT_PATH.exists():
        raise FileNotFoundError(f"Lessico di base non trovato: {LEXICON_OUTPUT_PATH}")
    if not IWN_XML_PATH.exists():
        raise FileNotFoundError(f"File Open ItalWordNet non trovato: {IWN_XML_PATH}")

    base_payload = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
    synsets, entries_by_norm = parse_iwn()

    # Each entry is shallow-copied so the loaded payload is not modified.
    enriched_entries = [
        enrich_entry(dict(raw_entry), synsets, entries_by_norm)
        for raw_entry in base_payload.get("entries", [])
    ]

    meta = {
        "language": "it",
        "version": 1,
        "base_lexicon": str(LEXICON_OUTPUT_PATH.name),
        "sources": [
            "lexicon_it.json",
            "iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml",
        ],
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(enriched_entries),
        "semantic_source": "IWN-OMW v1.0",
    }
    return {"meta": meta, "entries": enriched_entries}
|
|
|
|
|
|
def main() -> None:
    """Generate the semantic lexicon, write it to disk and print a summary."""
    payload = build_semantic_lexicon()
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    SEMANTIC_LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    # Count the entries for which a match was found.
    matched = 0
    for entry in payload["entries"]:
        if entry.get("semantic", {}).get("matched"):
            matched += 1

    print(f"Lessico semantico generato: {SEMANTIC_LEXICON_OUTPUT_PATH}")
    print(f"Voci totali: {payload['meta']['entry_count']}")
    print(f"Voci con match semantico: {matched}")
|
|
|
|
|
|
# Run the generation only when this file is executed as a script.
if __name__ == "__main__":
    main()
|