feat: collega il lessico semantico al filler
This commit is contained in:
BIN
__pycache__/build_lexicon.cpython-313.pyc
Normal file
BIN
__pycache__/build_lexicon.cpython-313.pyc
Normal file
Binary file not shown.
BIN
__pycache__/build_semantic_lexicon.cpython-313.pyc
Normal file
BIN
__pycache__/build_semantic_lexicon.cpython-313.pyc
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
249
build_lexicon.py
Normal file
249
build_lexicon.py
Normal file
@@ -0,0 +1,249 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
|
from build_vocabulary import (
|
||||||
|
FILTERED_OUTPUT_PATH,
|
||||||
|
METADATA_OUTPUT_PATH,
|
||||||
|
build_vocabulary,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Destination of the generated base lexicon: "lexicon_it.json" next to this script.
LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it.json")
|
||||||
|
|
||||||
|
# Maps a vocabulary metadata tag to the part-of-speech label stored in each
# lexicon entry; infer_pos() returns the label of the first tag found here.
POS_BY_TAG = {
    "function": "PREP",
    "verb_infinitive": "VERB",
    "adverb": "ADV",
    "adjective_like": "ADJ",
    "noun_like": "NOUN",
}
|
||||||
|
|
||||||
|
# (minimum quality, register label) pairs listed from the highest threshold
# down; infer_register() returns the label of the first threshold reached.
REGISTER_BY_QUALITY = [
    (8, "common"),
    (5, "standard"),
    (3, "formal"),
    (0, "rare"),
]
|
||||||
|
|
||||||
|
# Topic name -> set of exact word forms that belong to that topic.
# infer_topics() adds a topic when the word itself is a member of its set.
# Words are stored without accents (e.g. "citta"); a word may appear under
# several topics.
TOPIC_KEYWORDS = {
    "animals": {
        "cane", "gatto", "lupo", "volpe", "orso", "pesce", "cervo", "cavallo", "capra", "pecora",
        "leone", "tigre", "zebra", "aquila", "falco", "serpente", "vipera", "gabbiano", "anatra",
        "passero", "coniglio", "castoro", "bruco", "cigno", "asino", "alpaca",
    },
    "plants": {
        "albero", "pianta", "fiore", "foglia", "radice", "seme", "bosco", "selva", "ulivo", "quercia",
        "ortica", "edera", "aloe", "tulipano", "spiga", "polline", "grano", "erba",
    },
    "nature": {
        "natura", "bosco", "selva", "montagna", "collina", "roccia", "pietra", "fiume", "lago", "mare",
        "riva", "fonte", "onda", "vento", "fuoco", "terra", "sole", "luna", "aurora", "nuvola",
        "nebbia", "deserto", "isola", "greto", "radice", "fiore", "foglia", "erba", "zolla",
    },
    "ecology": {
        "ambiente", "ecologia", "natura", "bosco", "energia", "acqua", "terra", "clima", "sorgere",
        "fonte", "solare", "verde", "ulivo", "pianta", "polline", "grano", "radice",
    },
    "geography": {
        "montagna", "collina", "isola", "deserto", "equatore", "ovest", "oriente", "riva", "mare",
        "lago", "fiume", "ponte", "confine", "quota", "pianeta", "roccia", "greto",
    },
    "weather": {
        "vento", "nebbia", "aurora", "pioggia", "sole", "nuvola", "tempesta", "brina", "sereno",
        "clima", "goccia",
    },
    "sea": {
        "mare", "onda", "vela", "barca", "porto", "pesce", "ancora", "scoglio", "riva", "veliero",
    },
    "mountain": {
        "montagna", "quota", "vetta", "roccia", "greto", "collina", "sentiero", "alpino",
    },
    "health": {
        "salute", "febbre", "medico", "cura", "respiro", "diuretico", "anemico", "vigore", "energia",
        "dente", "cuore", "corpo", "viso",
    },
    "science": {
        "atomo", "energia", "metodo", "equatore", "digitale", "misura", "tecnica", "triangolo",
        "microfibra", "microscopio", "algoritmo", "motore", "materia", "liquido",
    },
    "sport": {
        "calcio", "atleta", "sportivo", "gol", "pallone", "gara", "trionfo", "primato", "allenatore",
        "stadio", "squadra", "rete",
    },
    "history": {
        "re", "principe", "regno", "impero", "senato", "console", "legione", "vittoria", "epoca",
        "origine", "ritorno",
    },
    "school": {
        "libro", "quaderno", "lezione", "classe", "studiare", "maestro", "scuola", "esame", "penna",
        "aula", "figura", "titolo",
    },
    "cinema": {
        "film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "voce", "visione",
        "finale", "figura",
    },
    "literature": {
        "libro", "poesia", "favola", "fiaba", "frase", "parola", "lettura", "autore", "storia",
        "leggenda", "scrivere", "titolo",
    },
    "food": {
        "pane", "cacao", "gelato", "burro", "latte", "mandorla", "nocciola", "cena", "pranzo",
        "zuppa", "zucchero", "acqua", "fiore", "frutto",
    },
    "city": {
        "porta", "strada", "piazza", "ponte", "palazzo", "cortile", "villaggio", "citta", "urbano",
        "casale", "balcone", "finestra", "stazione",
    },
    "transport": {
        "automobile", "barca", "vela", "treno", "motore", "viaggio", "ruota", "ponte", "pilota",
        "volo", "aeroporto", "vettura",
    },
    "work": {
        "lavoro", "opera", "progetto", "metodo", "tecnica", "strumento", "martello", "guida",
        "mestiere", "servire",
    },
    "home": {
        "casa", "finestra", "porta", "parete", "divano", "tavolo", "sedia", "camera", "balcone",
        "camino", "tetto", "cortile", "vasca",
    },
}
|
||||||
|
|
||||||
|
# Topic name -> tuple of word endings; infer_topics() adds the topic when the
# word ends with any of them (plain str.endswith, no word-boundary check).
# NOTE(review): because matching is a raw suffix test, unrelated words can
# match (e.g. "amare" ends with "mare", any word ending in "are" gets
# "actions") - confirm this breadth is intended.
TOPIC_SUFFIXES = {
    "actions": ("are", "ere", "ire"),
    "abstract": ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza"),
    "animals": ("cane", "gatto", "lupo", "pesce", "volpe", "orso"),
    "plants": ("fiore", "foglia", "seme", "radice", "erba"),
    "nature": ("mare", "lago", "bosco", "vento", "onda", "roccia"),
    "geography": ("montagna", "isola", "deserto", "confine"),
    "city": ("strada", "palazzo", "porta", "ponte"),
}
|
||||||
|
|
||||||
|
|
||||||
|
def infer_pos(tags: List[str]) -> str:
    """Return the part-of-speech label for the first tag known to POS_BY_TAG.

    Tags are examined in the order given; when none of them is a key of
    POS_BY_TAG (including an empty list) the result is "NOUN".
    """
    recognised = (POS_BY_TAG[tag] for tag in tags if tag in POS_BY_TAG)
    return next(recognised, "NOUN")
|
||||||
|
|
||||||
|
|
||||||
|
def infer_topics(word: str, tags: List[str]) -> List[str]:
    """Return the alphabetically sorted topic labels inferred for ``word``.

    Every word gets "general". Further topics come from the "verb_infinitive"
    tag, a fixed list of abstract-noun endings, exact membership in
    TOPIC_KEYWORDS and suffix matches against TOPIC_SUFFIXES. Some topics
    then imply broader ones ("nature", "ecology").
    """
    abstract_endings = ("tore", "trice", "zione", "ismo", "ista", "mento", "anza", "enza")

    found = {"general"}
    if "verb_infinitive" in tags:
        found.add("actions")
    if word.endswith(abstract_endings):
        found.add("abstract")

    # Exact keyword membership, then raw suffix matching.
    found.update(name for name, vocabulary in TOPIC_KEYWORDS.items() if word in vocabulary)
    found.update(
        name
        for name, endings in TOPIC_SUFFIXES.items()
        if any(word.endswith(ending) for ending in endings)
    )

    # Narrow topics imply the broader ones.
    if "plants" in found:
        found.add("ecology")
    if found & {"animals", "plants", "sea", "mountain", "weather", "geography"}:
        found.add("nature")

    return sorted(found)
|
||||||
|
|
||||||
|
|
||||||
|
def infer_register(quality: int) -> str:
    """Map a quality score to a register label using REGISTER_BY_QUALITY.

    The table is scanned in its declared order and the label of the first
    threshold that ``quality`` reaches is returned; a score below every
    threshold yields "rare".
    """
    reached = (label for threshold, label in REGISTER_BY_QUALITY if quality >= threshold)
    return next(reached, "rare")
|
||||||
|
|
||||||
|
|
||||||
|
def frequency_from_quality(quality: int, index: int, total: int) -> tuple[int, float]:
    """Derive a 1-based rank and a frequency score from list position and quality.

    Args:
        quality: Quality score; clamped to the range 0..10 before use.
        index: Zero-based position of the word in the vocabulary list.
        total: Number of words in the list.

    Returns:
        ``(rank, score)`` where ``rank`` is ``index + 1`` and ``score`` is
        70% of the position weight (1.0 for the first word, 0.0 for the
        last) plus ``quality / 20``, capped at 1.0 and rounded to 4 decimals.
    """
    span = max(1, total - 1)  # avoids division by zero for a one-word list
    position_weight = 1.0 - index / span
    clamped_quality = min(max(quality, 0), 10)
    blended = position_weight * 0.7 + clamped_quality / 20.0
    return index + 1, round(min(1.0, blended), 4)
|
||||||
|
|
||||||
|
|
||||||
|
def load_words() -> List[str]:
    """Read the filtered vocabulary file and return its non-blank lines, stripped.

    When either the filtered word list or the metadata file is missing,
    build_vocabulary() is called first.
    """
    sources_ready = FILTERED_OUTPUT_PATH.exists() and METADATA_OUTPUT_PATH.exists()
    if not sources_ready:
        build_vocabulary()

    raw_lines = FILTERED_OUTPUT_PATH.read_text(encoding="utf-8").splitlines()
    return [stripped for stripped in map(str.strip, raw_lines) if stripped]
|
||||||
|
|
||||||
|
|
||||||
|
def load_metadata() -> Dict[str, Dict[str, object]]:
    """Return the parsed JSON content of the vocabulary metadata file.

    build_vocabulary() is called first when the metadata file does not exist.
    """
    metadata_path = METADATA_OUTPUT_PATH
    if not metadata_path.exists():
        build_vocabulary()
    raw_json = metadata_path.read_text(encoding="utf-8")
    return json.loads(raw_json)
|
||||||
|
|
||||||
|
|
||||||
|
def build_lexicon() -> Dict[str, object]:
    """Assemble the base lexicon from the filtered word list and its metadata.

    Returns:
        A dict with a "meta" header (language, version, sources, generation
        timestamp, entry count) and an "entries" list holding one record per
        word, in the order the words appear in the filtered list.
    """
    vocabulary = load_words()
    metadata_by_word = load_metadata()
    word_count = len(vocabulary)

    records = []
    for position, form in enumerate(vocabulary):
        info = metadata_by_word.get(form, {})
        flags = list(info.get("tags", []))
        quality = int(info.get("quality", 0))
        rank, score = frequency_from_quality(quality, position, word_count)

        # Ease grows with quality (1..5); difficulty is its mirror image on
        # the same 1..5 scale.
        ease = max(1, min(5, quality // 2 + 1))
        difficulty = max(1, min(5, 6 - ease))

        records.append(
            {
                "form": form,
                "normalized_form": form,
                "lemma": form,
                "pos": infer_pos(flags),
                "length": len(form),
                "frequency_rank": rank,
                "frequency_score": score,
                "difficulty_word": difficulty,
                "allowed_in_crossword": True,
                "quality_score": max(0, min(10, quality)),
                "topics": infer_topics(form, flags),
                "morph_features": {},
                "register": infer_register(quality),
                "source_flags": ["from_filtered_vocabulary", "from_metadata_heuristics"],
                "crossword_flags": flags,
                "notes": "",
            }
        )

    header = {
        "language": "it",
        "version": 1,
        "sources": ["vocaboli_it_filtrato.txt", "vocaboli_it_metadata.json"],
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(records),
    }
    return {"meta": header, "entries": records}
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Build the base lexicon, write it to LEXICON_OUTPUT_PATH and print a summary."""
    lexicon = build_lexicon()
    serialized = json.dumps(lexicon, ensure_ascii=False, indent=2)
    LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    print(f"Lessico generato: {LEXICON_OUTPUT_PATH}")
    entry_count = lexicon["meta"]["entry_count"]
    print(f"Voci generate: {entry_count}")
|
||||||
|
|
||||||
|
|
||||||
|
# Generate the lexicon only when this module is executed as a script.
if __name__ == "__main__":
    main()
|
||||||
426
build_semantic_lexicon.py
Normal file
426
build_semantic_lexicon.py
Normal file
@@ -0,0 +1,426 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from collections import defaultdict
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Iterable, List, Tuple
|
||||||
|
|
||||||
|
from build_lexicon import LEXICON_OUTPUT_PATH, infer_topics
|
||||||
|
|
||||||
|
|
||||||
|
# Location of the IWN-OMW LMF XML file, resolved relative to this script.
IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml"
|
||||||
|
# Destination of the enriched lexicon: "lexicon_it_semantic.json" next to this script.
SEMANTIC_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_semantic.json")
|
||||||
|
|
||||||
|
# Maps the lower-cased partOfSpeech code read from a Lemma element to the
# label used in the lexicon; both "a" and "s" map to "ADJ". parse_iwn()
# falls back to "NOUN" for codes missing from this table.
IWN_POS_MAP = {
    "n": "NOUN",
    "v": "VERB",
    "a": "ADJ",
    "s": "ADJ",
    "r": "ADV",
}
|
||||||
|
|
||||||
|
# Topic name -> set of accent-free, lower-case cue words.
# semantic_topics_from_text() assigns a topic when any of its cue words is
# among the tokens extracted from glosses, lemmas and related terms.
SEMANTIC_TOPIC_KEYWORDS = {
    "animals": {
        "animale", "animali", "mammifero", "mammiferi", "uccello", "uccelli", "pesce",
        "rettile", "domestico", "compagnia", "caccia", "pastorizia",
    },
    "plants": {
        "pianta", "piante", "albero", "alberi", "fiore", "foglia", "foglie", "frutto",
        "ghianda", "bosco", "radice", "seme", "vegetale",
    },
    "nature": {
        "natura", "naturale", "terra", "acqua", "aria", "mare", "montagna", "bosco",
        "lago", "fiume", "vento", "roccia", "suolo", "superficie", "terrestre",
    },
    "ecology": {
        "ecologia", "ambiente", "ambientale", "clima", "energia", "naturale", "verde",
        "ecosistema", "acqua", "terra",
    },
    "geography": {
        "territorio", "superficie", "terrestre", "regione", "confine", "montagna",
        "isola", "mare", "lago", "fiume",
    },
    "weather": {
        "clima", "pioggia", "vento", "nuvola", "nebbia", "tempesta", "gelo", "brina",
        "atmosfera",
    },
    "sea": {
        "mare", "marino", "marina", "acque", "salate", "porto", "barca", "vela",
        "nave", "fondale",
    },
    "mountain": {
        "montagna", "vetta", "collina", "rilievo", "alpino", "roccia", "sentiero",
    },
    "health": {
        "salute", "medico", "medicina", "corpo", "sangue", "cura", "malattia",
        "terapia", "cervello", "respiro",
    },
    "science": {
        "scienza", "scientifico", "tecnica", "misura", "energia", "materia", "fisica",
        "chimica", "biologia", "strumento",
    },
    "sport": {
        "sport", "gara", "squadra", "pallone", "atleta", "stadio", "rete", "gioco",
        "agonistico",
    },
    "history": {
        "storia", "storico", "antico", "regno", "impero", "senato", "romano", "epoca",
    },
    "school": {
        "scuola", "lezione", "studente", "classe", "maestro", "esame", "libro",
        "quaderno", "aula",
    },
    "cinema": {
        "film", "cinema", "pellicola", "regista", "attore", "scena", "spettacolo",
        "teatro",
    },
    "literature": {
        "libro", "autore", "lettura", "scrittura", "racconto", "poesia", "romanzo",
        "letteratura",
    },
    "food": {
        "cibo", "bevanda", "mangiare", "pane", "frutto", "latte", "zucchero", "farina",
        "gelato",
    },
    "city": {
        "citta", "urbano", "strada", "piazza", "ponte", "palazzo", "stazione", "porta",
        "quartiere",
    },
    "transport": {
        "veicolo", "trasporto", "strada", "motore", "treno", "ruota", "barca", "nave",
        "aereo", "automobile",
    },
    "work": {
        "lavoro", "mestiere", "opera", "progetto", "strumento", "tecnica", "servizio",
    },
    "home": {
        "casa", "abitazione", "porta", "finestra", "parete", "camera", "balcone",
        "tavolo", "sedia",
    },
}
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_word(text: str) -> str:
    """Fold ``text`` to a key made only of the lower-case letters a-z.

    The text is NFKD-decomposed, characters that cannot be encoded as ASCII
    (such as combining accents) are dropped, the rest is lower-cased and
    every character outside a-z (spaces, digits, underscores, punctuation)
    is removed.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    folded = decomposed.encode("ascii", "ignore").decode("ascii").lower()
    return "".join(char for char in folded if "a" <= char <= "z")
|
||||||
|
|
||||||
|
|
||||||
|
def semantic_topics_from_text(parts: Iterable[str]) -> List[str]:
    """Return the sorted topics whose cue words occur in any of ``parts``.

    Each part (a gloss, a lemma, a related term) contributes two kinds of
    tokens: its fully collapsed form as produced by normalize_word(), and
    every individual word of the accent-folded, lower-cased text.

    The individual words are extracted *before* separators are removed.
    Tokenizing the output of normalize_word() instead would always yield a
    single token, because that function strips spaces and punctuation, so
    cue words inside multi-word definitions could never match.

    Args:
        parts: Text fragments to scan; fragments that normalize to an empty
            string are ignored.

    Returns:
        Alphabetically sorted topic names from SEMANTIC_TOPIC_KEYWORDS.
    """
    tokens = set()
    for part in parts:
        collapsed = normalize_word(part)
        if not collapsed:
            continue
        # Keeps multi-word lemmas such as "cane_lupo" matchable as one key.
        tokens.add(collapsed)
        # Same accent folding as normalize_word(), but word boundaries are
        # preserved so that each word becomes its own token.
        folded = (
            unicodedata.normalize("NFKD", part)
            .encode("ascii", "ignore")
            .decode("ascii")
            .lower()
        )
        tokens.update(re.findall(r"[a-z]+", folded))

    return sorted(
        topic
        for topic, keywords in SEMANTIC_TOPIC_KEYWORDS.items()
        if any(keyword in tokens for keyword in keywords)
    )
|
||||||
|
|
||||||
|
|
||||||
|
def parse_iwn() -> Tuple[Dict[str, Dict[str, object]], Dict[str, List[Dict[str, object]]]]:
    """Parse the IWN-OMW LMF file into synset records and a lemma index.

    Returns:
        ``(synsets, entries_by_norm)``. ``synsets`` maps a synset id to a
        record with "id", "ili", "definition", "relations", "lemmas" (unique,
        in first-seen order) and "pos". ``entries_by_norm`` maps a normalized
        written form to the lexical entries sharing it, each carrying its
        written form, part of speech and the senses that point to a known
        synset.
    """
    dc_type_attribute = "{https://globalwordnet.github.io/schemas/dc/}type"

    raw_xml = IWN_XML_PATH.read_text(encoding="utf-8")
    # Every occurrence of '" -->' is rewritten to '">' before parsing.
    # NOTE(review): presumably a workaround for malformed tags in the
    # distributed file - confirm against the data.
    document = ET.fromstring(raw_xml.replace('" -->', '">'))

    synsets: Dict[str, Dict[str, object]] = {}
    for node in document.findall(".//Synset"):
        identifier = node.attrib.get("id", "")
        outgoing = [
            {
                "type": link.attrib.get("relType", ""),
                "target": link.attrib.get("target", ""),
                "subtype": link.attrib.get(dc_type_attribute, ""),
            }
            for link in node.findall("SynsetRelation")
        ]
        synsets[identifier] = {
            "id": identifier,
            "ili": node.attrib.get("ili", ""),
            "definition": (node.findtext("Definition") or "").strip(),
            "relations": outgoing,
            "lemmas": [],
            "pos": "",
        }

    entries_by_norm: Dict[str, List[Dict[str, object]]] = defaultdict(list)
    for lexical_entry in document.findall(".//LexicalEntry"):
        lemma_node = lexical_entry.find("Lemma")
        if lemma_node is None:
            continue

        written_form = lemma_node.attrib.get("writtenForm", "").strip()
        normalized_form = normalize_word(written_form)
        if not normalized_form:
            continue

        pos_code = lemma_node.attrib.get("partOfSpeech", "").strip().lower()
        pos = IWN_POS_MAP.get(pos_code, "NOUN")

        linked_senses = []
        for sense in lexical_entry.findall("Sense"):
            target_id = sense.attrib.get("synset", "")
            identifier = sense.attrib.get("id", "")
            if not (target_id and target_id in synsets):
                continue
            # The synset takes the part of speech of the last lemma linked to it.
            target = synsets[target_id]
            target["lemmas"].append(written_form)
            target["pos"] = pos
            linked_senses.append({"sense_id": identifier, "synset_id": target_id, "pos": pos})

        # Entries without any resolvable sense are not indexed.
        if linked_senses:
            entries_by_norm[normalized_form].append(
                {
                    "written_form": written_form,
                    "normalized_form": normalized_form,
                    "pos": pos,
                    "senses": linked_senses,
                }
            )

    # Drop repeated lemmas while keeping first-seen order.
    for record in synsets.values():
        record["lemmas"] = list(dict.fromkeys(record["lemmas"]))

    return synsets, entries_by_norm
|
||||||
|
|
||||||
|
|
||||||
|
def score_sense(
    synset_id: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> int:
    """Score how well a synset fits the topics already assigned to a word.

    Args:
        synset_id: Id of the synset to evaluate; unknown ids score 0.
        current_topics: Topics of the lexicon entry being enriched.
        synsets: Synset records as returned by parse_iwn().

    Returns:
        Ten points per topic shared between ``current_topics`` and the topics
        inferred from the synset's gloss and lemmas, plus one point per
        character of the gloss.
    """
    record = synsets.get(synset_id, {})
    gloss = str(record.get("definition", ""))
    evidence = [gloss, *record.get("lemmas", [])]
    synset_topics = set(semantic_topics_from_text(evidence))
    wanted_topics = {str(topic) for topic in current_topics}
    shared = synset_topics & wanted_topics
    # NOTE(review): gloss length is added unscaled, so a long gloss can
    # outweigh topic overlap - confirm this weighting is intended.
    return 10 * len(shared) + len(gloss)
|
||||||
|
|
||||||
|
|
||||||
|
def best_candidate(
    candidates: List[Dict[str, object]],
    expected_pos: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> Dict[str, object]:
    """Pick the lexical entry that best matches the expected POS and topics.

    A candidate earns 100 points when its "pos" equals ``expected_pos`` plus
    the highest score_sense() value among its senses (0 when it has none).
    Among equally scored candidates the earliest one in ``candidates`` wins.

    Raises:
        IndexError: If ``candidates`` is empty.
    """

    def total_score(candidate: Dict[str, object]) -> int:
        pos_bonus = 100 if candidate["pos"] == expected_pos else 0
        sense_scores = [
            score_sense(str(sense.get("synset_id", "")), current_topics, synsets)
            for sense in candidate.get("senses", [])
        ]
        return pos_bonus + max([0, *sense_scores])

    # sorted() is stable, so ties keep their original relative order even
    # with reverse=True.
    return sorted(candidates, key=total_score, reverse=True)[0]
|
||||||
|
|
||||||
|
|
||||||
|
def dedupe_keep_order(items: Iterable[str]) -> List[str]:
    """Return the distinct, non-empty items in first-seen order.

    Each item is converted with ``str()`` and stripped of surrounding
    whitespace before comparison; items that end up empty are discarded.
    """
    cleaned = (str(item).strip() for item in items)
    # dict preserves insertion order, so fromkeys() keeps the first occurrence.
    return list(dict.fromkeys(text for text in cleaned if text))
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_related_lemmas(
    synset_ids: Iterable[str],
    relation_type: str,
    synsets: Dict[str, Dict[str, object]],
) -> List[str]:
    """Collect the lemmas of synsets reached through one relation type.

    Args:
        synset_ids: Ids of the source synsets; unknown ids are skipped.
        relation_type: Value of the relation "type" field to follow.
        synsets: Synset records as returned by parse_iwn().

    Returns:
        The lemmas of every known target synset, de-duplicated in
        first-seen order.
    """
    gathered = []
    for synset_id in synset_ids:
        source = synsets.get(synset_id)
        if not source:
            continue
        targets = (
            synsets.get(link.get("target", ""))
            for link in source.get("relations", [])
            if link.get("type") == relation_type
        )
        for target in targets:
            if target:
                gathered.extend(target.get("lemmas", []))
    return dedupe_keep_order(gathered)
|
||||||
|
|
||||||
|
|
||||||
|
def collect_relation_terms(
    synset_ids: Iterable[str],
    relation_types: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
    *,
    limit: int = 20,
) -> Dict[str, List[str]]:
    """Map each relation type to the lemmas reachable through it.

    Args:
        synset_ids: Ids of the source synsets. May be any iterable, including
            a one-shot generator.
        relation_types: Relation types to resolve; each becomes a key of the
            result.
        synsets: Synset records as returned by parse_iwn().
        limit: Maximum number of lemmas kept per relation type (default 20).

    Returns:
        ``{relation_type: lemmas}`` with at most ``limit`` lemmas per type.
    """
    # The ids are walked once per relation type; materialising them first
    # keeps a generator argument from being exhausted after the first type.
    source_ids = list(synset_ids)
    return {
        relation_type: resolve_related_lemmas(source_ids, relation_type, synsets)[:limit]
        for relation_type in relation_types
    }
|
||||||
|
|
||||||
|
|
||||||
|
def enrich_entry(
    entry: Dict[str, object],
    synsets: Dict[str, Dict[str, object]],
    entries_by_norm: Dict[str, List[Dict[str, object]]],
) -> Dict[str, object]:
    """Attach IWN-derived semantic data to a lexicon entry.

    The entry is looked up in ``entries_by_norm`` by the normalized values of
    its "form", "lemma" and "normalized_form". When nothing matches, an
    empty "semantic" block with ``matched=False`` is stored. Otherwise the
    best candidate is chosen, its senses are ranked, and the entry receives
    synset payloads (top three senses), synonyms, related terms, glosses and
    extended topics.

    Args:
        entry: Lexicon entry; it is modified in place.
        synsets: Synset records as returned by parse_iwn().
        entries_by_norm: Lemma index as returned by parse_iwn().

    Returns:
        The same ``entry`` object, with "semantic" set and, on a match,
        "topics" extended.
    """
    relation_kinds = ("hypernym", "hyponym", "similar")
    form_key = normalize_word(str(entry.get("form", "")))
    lookup_keys = dedupe_keep_order(
        [
            form_key,
            normalize_word(str(entry.get("lemma", ""))),
            normalize_word(str(entry.get("normalized_form", ""))),
        ]
    )
    matches: List[Dict[str, object]] = [
        hit for key in lookup_keys for hit in entries_by_norm.get(key, [])
    ]

    if not matches:
        entry["semantic"] = {
            "source": "iwn-omw",
            "matched": False,
            "match_count": 0,
            "synsets": [],
            "synonyms": [],
            "raw_relation_terms": {},
            "glosses": [],
            "semantic_topics": [],
        }
        return entry

    base_topics = entry.get("topics", [])

    def sense_score(sense: Dict[str, object]) -> int:
        return score_sense(str(sense.get("synset_id", "")), base_topics, synsets)

    selected = best_candidate(matches, str(entry.get("pos", "")), base_topics, synsets)
    ranked_senses = sorted(selected.get("senses", []), key=sense_score, reverse=True)
    synset_ids = [sense["synset_id"] for sense in ranked_senses]

    synset_payloads = []
    glosses = []
    synonyms = []
    # Only the three best-ranked senses are described in detail.
    for synset_id in synset_ids[:3]:
        record = synsets.get(synset_id)
        if not record:
            continue
        gloss = str(record.get("definition", "")).strip()
        glosses.append(gloss)
        synset_payloads.append(
            {
                "id": synset_id,
                "pos": record.get("pos", ""),
                "definition": gloss,
                "lemmas": dedupe_keep_order(record.get("lemmas", []))[:12],
                "raw_relation_terms": collect_relation_terms([synset_id], relation_kinds, synsets),
            }
        )
        synonyms.extend(record.get("lemmas", []))

    # Related terms are gathered across every sense, not just the top three.
    raw_relation_terms = collect_relation_terms(synset_ids, relation_kinds, synsets)

    # The entry's own form is not a synonym of itself.
    synonyms = [
        lemma for lemma in dedupe_keep_order(synonyms) if normalize_word(lemma) != form_key
    ][:20]
    glosses = dedupe_keep_order(glosses)

    topic_evidence = glosses + synonyms
    for kind in relation_kinds:
        topic_evidence = topic_evidence + raw_relation_terms.get(kind, [])
    semantic_topics = dedupe_keep_order(
        list(base_topics) + semantic_topics_from_text(topic_evidence)
    )

    entry["topics"] = dedupe_keep_order(list(entry.get("topics", [])) + semantic_topics)
    entry["semantic"] = {
        "source": "iwn-omw",
        "matched": True,
        "match_count": len(matches),
        "selected_form": selected.get("written_form", ""),
        "synsets": synset_payloads,
        "synonyms": synonyms,
        "raw_relation_terms": raw_relation_terms,
        "glosses": glosses,
        "semantic_topics": semantic_topics,
    }
    return entry
|
||||||
|
|
||||||
|
|
||||||
|
def build_semantic_lexicon() -> Dict[str, object]:
    """Enrich every entry of the base lexicon with IWN-OMW semantic data.

    Returns:
        A dict with a "meta" header and the "entries" list, each entry being
        a shallow copy of the base entry processed by enrich_entry().

    Raises:
        FileNotFoundError: If the base lexicon or the IWN XML file is missing.
    """
    if not LEXICON_OUTPUT_PATH.exists():
        raise FileNotFoundError(f"Lessico di base non trovato: {LEXICON_OUTPUT_PATH}")
    if not IWN_XML_PATH.exists():
        raise FileNotFoundError(f"File Open ItalWordNet non trovato: {IWN_XML_PATH}")

    base_lexicon = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
    synsets, entries_by_norm = parse_iwn()

    # dict(item) copies the top level so the loaded payload is left untouched.
    enriched_entries = [
        enrich_entry(dict(item), synsets, entries_by_norm)
        for item in base_lexicon.get("entries", [])
    ]

    header = {
        "language": "it",
        "version": 1,
        "base_lexicon": str(LEXICON_OUTPUT_PATH.name),
        "sources": [
            "lexicon_it.json",
            "iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml",
        ],
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(enriched_entries),
        "semantic_source": "IWN-OMW v1.0",
    }
    return {"meta": header, "entries": enriched_entries}
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Build the semantic lexicon, write it to disk and print match statistics."""
    payload = build_semantic_lexicon()
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    SEMANTIC_LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    # Count the entries for which an IWN match was found.
    matched = len(
        [item for item in payload["entries"] if item.get("semantic", {}).get("matched")]
    )
    print(f"Lessico semantico generato: {SEMANTIC_LEXICON_OUTPUT_PATH}")
    print(f"Voci totali: {payload['meta']['entry_count']}")
    print(f"Voci con match semantico: {matched}")
|
||||||
|
|
||||||
|
|
||||||
|
# Generate the semantic lexicon only when this module is executed as a script.
if __name__ == "__main__":
    main()
|
||||||
55
create_passo4.bat
Normal file
55
create_passo4.bat
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
@echo off
rem Creates (or switches to) the branch named in BRANCH_NAME, stages all
rem changes, commits them and pushes the branch to "origin".
rem Usage: create_passo4.bat ["commit message"]
setlocal

rem Work from the directory that contains this script.
cd /d "%~dp0"

set "BRANCH_NAME=passo4"
set "COMMIT_MSG=feat: aggiunge il lessico semantico con integrazione ItalWordNet"

rem An optional first argument overrides the default commit message.
if not "%~1"=="" (
    set "COMMIT_MSG=%~1"
)

echo Repository: %cd%
echo Branch target: %BRANCH_NAME%
echo Commit message: %COMMIT_MSG%
echo.

rem Abort when the current directory is not inside a Git work tree.
git rev-parse --is-inside-work-tree >nul 2>nul
if errorlevel 1 (
    echo Errore: questa cartella non e' un repository Git.
    exit /b 1
)

rem Create the branch when it does not exist yet, otherwise just check it out.
git show-ref --verify --quiet refs/heads/%BRANCH_NAME%
if errorlevel 1 (
    echo Creo il branch %BRANCH_NAME%...
    git checkout -b %BRANCH_NAME%
) else (
    echo Il branch %BRANCH_NAME% esiste gia', ci passo sopra...
    git checkout %BRANCH_NAME%
)
if errorlevel 1 exit /b 1

echo.
echo Aggiungo le modifiche...
rem NOTE(review): "git add ." stages every file that is not ignored, which
rem includes generated files such as __pycache__ - confirm a .gitignore
rem covers them.
git add .
if errorlevel 1 exit /b 1

echo.
echo Creo il commit...
git commit -m "%COMMIT_MSG%"
rem A failed commit stops the script before the push.
if errorlevel 1 (
    echo.
    echo Nessun commit creato. Potrebbe non esserci nulla di nuovo da salvare.
    exit /b 1
)

echo.
echo Eseguo il push del branch %BRANCH_NAME%...
rem -u sets origin/%BRANCH_NAME% as the upstream of the local branch.
git push -u origin %BRANCH_NAME%
if errorlevel 1 exit /b 1

echo.
echo Operazione completata con successo.
endlocal
|
||||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
import random
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
|
from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
|
||||||
@@ -72,6 +73,7 @@ class CrosswordFiller:
|
|||||||
*,
|
*,
|
||||||
target_empty_ratio: float = TARGET_EMPTY_RATIO,
|
target_empty_ratio: float = TARGET_EMPTY_RATIO,
|
||||||
vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None,
|
vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None,
|
||||||
|
seed: Optional[int] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
self.state = state.copy()
|
self.state = state.copy()
|
||||||
self.initial_state = state.copy()
|
self.initial_state = state.copy()
|
||||||
@@ -81,6 +83,8 @@ class CrosswordFiller:
|
|||||||
self.vocabulary = self._normalize_vocabulary(vocabulary)
|
self.vocabulary = self._normalize_vocabulary(vocabulary)
|
||||||
self.words_by_length = self._index_vocabulary(self.vocabulary)
|
self.words_by_length = self._index_vocabulary(self.vocabulary)
|
||||||
self.vocabulary_metadata = vocabulary_metadata or {}
|
self.vocabulary_metadata = vocabulary_metadata or {}
|
||||||
|
self.seed = seed
|
||||||
|
self.rng = random.Random(seed)
|
||||||
self.bounds = self._compute_bounds(self.state.grid)
|
self.bounds = self._compute_bounds(self.state.grid)
|
||||||
self.total_cells = self._area(self.bounds)
|
self.total_cells = self._area(self.bounds)
|
||||||
self.target_empty_cells = max(0, int(round(self.total_cells * self.target_empty_ratio)))
|
self.target_empty_cells = max(0, int(round(self.total_cells * self.target_empty_ratio)))
|
||||||
@@ -181,6 +185,10 @@ class CrosswordFiller:
|
|||||||
|
|
||||||
collected = list(unique.values())
|
collected = list(unique.values())
|
||||||
collected.sort(key=self._slot_priority, reverse=True)
|
collected.sort(key=self._slot_priority, reverse=True)
|
||||||
|
if len(collected) > 1:
|
||||||
|
top_slice = collected[: min(MAX_SLOT_CANDIDATES, len(collected))]
|
||||||
|
self.rng.shuffle(top_slice)
|
||||||
|
collected = top_slice + collected[min(MAX_SLOT_CANDIDATES, len(collected)) :]
|
||||||
return collected
|
return collected
|
||||||
|
|
||||||
def _slots_from_start(self, x: int, y: int, direction: str) -> Iterable[FillSlot]:
|
def _slots_from_start(self, x: int, y: int, direction: str) -> Iterable[FillSlot]:
|
||||||
@@ -292,7 +300,7 @@ class CrosswordFiller:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
candidates.sort(key=lambda item: item.local_score, reverse=True)
|
candidates.sort(key=lambda item: item.local_score, reverse=True)
|
||||||
return candidates[0]
|
return self.rng.choice(candidates[: min(3, len(candidates))])
|
||||||
|
|
||||||
def _word_quality(self, word: str) -> int:
|
def _word_quality(self, word: str) -> int:
|
||||||
metadata = self.vocabulary_metadata.get(word)
|
metadata = self.vocabulary_metadata.get(word)
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
import locale
|
import locale
|
||||||
|
import random
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
|
from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
|
||||||
@@ -114,6 +115,7 @@ class CrosswordGenerator:
|
|||||||
max_candidates_per_word: int = 12,
|
max_candidates_per_word: int = 12,
|
||||||
time_limit_seconds: float = 8.0,
|
time_limit_seconds: float = 8.0,
|
||||||
diffxy: int = DIFFXY,
|
diffxy: int = DIFFXY,
|
||||||
|
seed: Optional[int] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
normalized = [self._normalize(word) for word in words]
|
normalized = [self._normalize(word) for word in words]
|
||||||
unique_words = list(dict.fromkeys(word for word in normalized if len(word) >= 2))
|
unique_words = list(dict.fromkeys(word for word in normalized if len(word) >= 2))
|
||||||
@@ -122,6 +124,8 @@ class CrosswordGenerator:
|
|||||||
self.max_candidates_per_word = max_candidates_per_word
|
self.max_candidates_per_word = max_candidates_per_word
|
||||||
self.time_limit_seconds = time_limit_seconds
|
self.time_limit_seconds = time_limit_seconds
|
||||||
self.diffxy = diffxy
|
self.diffxy = diffxy
|
||||||
|
self.seed = seed
|
||||||
|
self.rng = random.Random(seed)
|
||||||
self.started_at = 0.0
|
self.started_at = 0.0
|
||||||
self.visited: Dict[Tuple[frozenset, Tuple[str, ...]], Tuple[int, int, int]] = {}
|
self.visited: Dict[Tuple[frozenset, Tuple[str, ...]], Tuple[int, int, int]] = {}
|
||||||
self.nodes_visited = 0
|
self.nodes_visited = 0
|
||||||
@@ -213,6 +217,8 @@ class CrosswordGenerator:
|
|||||||
reverse=True,
|
reverse=True,
|
||||||
)
|
)
|
||||||
candidates = candidates[: self.max_candidates_per_word]
|
candidates = candidates[: self.max_candidates_per_word]
|
||||||
|
if len(candidates) > 1:
|
||||||
|
self.rng.shuffle(candidates)
|
||||||
|
|
||||||
next_remaining = [word for word in remaining_words if word != next_word]
|
next_remaining = [word for word in remaining_words if word != next_word]
|
||||||
for placement in candidates:
|
for placement in candidates:
|
||||||
@@ -253,6 +259,10 @@ class CrosswordGenerator:
|
|||||||
word,
|
word,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
if len(ranked_words) > 1:
|
||||||
|
top_slice = ranked_words[: min(5, len(ranked_words))]
|
||||||
|
self.rng.shuffle(top_slice)
|
||||||
|
ranked_words = top_slice + ranked_words[min(5, len(ranked_words)) :]
|
||||||
|
|
||||||
best_word = ranked_words[0]
|
best_word = ranked_words[0]
|
||||||
best_key: Optional[Tuple[int, int, int, str]] = None
|
best_key: Optional[Tuple[int, int, int, str]] = None
|
||||||
|
|||||||
BIN
iwn-omw-main.zip
Normal file
BIN
iwn-omw-main.zip
Normal file
Binary file not shown.
21
iwn-omw-main/IWN-OMW-main/README.md
Normal file
21
iwn-omw-main/IWN-OMW-main/README.md
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
# IWN-OMW
|
||||||
|
This is the repository for the Open Italian WordNet, i.e. ItalWordNet versions compliant with the Open Multilingual WordNet guidelines and initiative.
|
||||||
|
|
||||||
|
IWN-OMW is a new LMF version of the ItalWordNet resource, converted and formatted according to the guidelines and requirements defined by the Open Multilingual Wordnet initiative (OMW, https://omwn.org/). The current version is derived from the ItalWordNet v.2 (IWN) database (http://hdl.handle.net/20.500.11752/ILC-62).
|
||||||
|
|
||||||
|
NB: 'dc:relation', when used, contains links to equivalent Senses in the RDF version of the SIMPLE Italian lexicon.
|
||||||
|
|
||||||
|
## Licence
|
||||||
|
|
||||||
|
CC-BY-SA 4.0
|
||||||
|
|
||||||
|
## Citation
|
||||||
|
|
||||||
|
If you use this resource please cite:
|
||||||
|
|
||||||
|
Quochi, Valeria, Roberto Bartolini, and Monica Monachini (to appear) ‘ItalwordNet goes open’. *LiLT Special Issues on Open Multilingual
|
||||||
|
WordNets*. CSLI Publications.
|
||||||
|
|
||||||
|
And
|
||||||
|
|
||||||
|
Roventini, Adriana, Antonietta Alonge, Francesca Bertagna, Nicoletta Calzolari, J. Cancila, C. Girardi, Bernardo Magnini, Rita Marinelli, Manuela Speranza, and Antonio Zampolli (2003) "ItalwordNet: building a large semantic database for the automatic treatment of Italian". *Linguistica Computazionale* 18-19:745-791.
|
||||||
487155
iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml
Normal file
487155
iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml
Normal file
File diff suppressed because it is too large
Load Diff
433535
lexicon_it.json
Normal file
433535
lexicon_it.json
Normal file
File diff suppressed because it is too large
Load Diff
1333962
lexicon_it_semantic.json
Normal file
1333962
lexicon_it_semantic.json
Normal file
File diff suppressed because it is too large
Load Diff
149
main.py
149
main.py
@@ -1,8 +1,9 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import Dict, List
|
||||||
|
|
||||||
from build_vocabulary import (
|
from build_vocabulary import (
|
||||||
FILTERED_OUTPUT_PATH,
|
FILTERED_OUTPUT_PATH,
|
||||||
@@ -10,39 +11,61 @@ from build_vocabulary import (
|
|||||||
OUTPUT_PATH,
|
OUTPUT_PATH,
|
||||||
build_vocabulary,
|
build_vocabulary,
|
||||||
)
|
)
|
||||||
|
from build_lexicon import LEXICON_OUTPUT_PATH, build_lexicon
|
||||||
|
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH, build_semantic_lexicon
|
||||||
from crossword_filler import CrosswordFiller, load_vocabulary, load_vocabulary_metadata
|
from crossword_filler import CrosswordFiller, load_vocabulary, load_vocabulary_metadata
|
||||||
from crossword_generator import CrosswordGenerator, WORDS, render_grid
|
from crossword_generator import CrosswordGenerator, WORDS, render_grid
|
||||||
|
|
||||||
|
|
||||||
|
DIFFICULTY_ALIASES: Dict[str, int] = {
|
||||||
|
"easy": 1,
|
||||||
|
"medium": 2,
|
||||||
|
"hard": 4,
|
||||||
|
"expert": 5,
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFAULT_TOPIC = "general"
|
||||||
|
|
||||||
|
|
||||||
def parse_args() -> argparse.Namespace:
|
def parse_args() -> argparse.Namespace:
|
||||||
parser = argparse.ArgumentParser(description="Generatore e filler di cruciverba.")
|
parser = argparse.ArgumentParser(description="Generatore e filler di cruciverba.")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--build-vocabulary",
|
"--build-vocabulary",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Rigenera il vocabolario esteso, filtrato e i metadati prima dell'esecuzione.",
|
help="Rigenera i file lessicali intermedi: vocabolario esteso, filtrato e metadati.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--build-lexicon",
|
||||||
|
action="store_true",
|
||||||
|
help="Rigenera `lexicon_it.json` prima dell'esecuzione.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--skip-fill",
|
"--skip-fill",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help="Genera solo la griglia iniziale senza eseguire il filler.",
|
help="Genera solo la griglia iniziale e salta il riempimento con il filler.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--build-semantic-lexicon",
|
||||||
|
action="store_true",
|
||||||
|
help="Rigenera `lexicon_it_semantic.json` arricchendo il lessico con IWN-OMW/ItalWordNet.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--vocabulary",
|
"--vocabulary",
|
||||||
type=Path,
|
type=Path,
|
||||||
default=None,
|
default=None,
|
||||||
help="Percorso opzionale a un vocabolario personalizzato.",
|
help="Percorso opzionale a un vocabolario testuale personalizzato da usare al posto di quello di default.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--target-empty-ratio",
|
"--target-empty-ratio",
|
||||||
type=float,
|
type=float,
|
||||||
default=1 / 6,
|
default=1 / 6,
|
||||||
help="Rapporto target di celle vuote residue dopo il filler.",
|
help="Rapporto target di celle vuote residue dopo il filler. Esempio: 0.1667 lascia circa un sesto di celle vuote.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--time-limit",
|
"--time-limit",
|
||||||
type=float,
|
type=float,
|
||||||
default=8.0,
|
default=8.0,
|
||||||
help="Tempo massimo in secondi per la fase di generazione iniziale.",
|
help="Tempo massimo in secondi per la fase di generazione iniziale della griglia.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--max-candidates",
|
"--max-candidates",
|
||||||
@@ -54,7 +77,23 @@ def parse_args() -> argparse.Namespace:
|
|||||||
"--diffxy",
|
"--diffxy",
|
||||||
type=int,
|
type=int,
|
||||||
default=7,
|
default=7,
|
||||||
help="Differenza massima preferita tra larghezza e altezza della griglia.",
|
help="Differenza massima preferita tra larghezza e altezza della griglia iniziale.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--seed",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Seed casuale per ottenere varianti riproducibili del cruciverba: stesso seed, stesso risultato.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--difficulty",
|
||||||
|
default="medium",
|
||||||
|
help="Difficolta lessicale del filler. Alias testuali: easy, medium, hard, expert. Internamente mappati a livelli numerici 1-5.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--topic",
|
||||||
|
default=DEFAULT_TOPIC,
|
||||||
|
help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.",
|
||||||
)
|
)
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
@@ -73,21 +112,110 @@ def ensure_vocabulary(args: argparse.Namespace) -> None:
|
|||||||
print(f"- parole filtrate: {totals['filtered_words']}")
|
print(f"- parole filtrate: {totals['filtered_words']}")
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_lexicon(args: argparse.Namespace) -> None:
|
||||||
|
needs_build = args.build_lexicon or not LEXICON_OUTPUT_PATH.exists()
|
||||||
|
if not needs_build:
|
||||||
|
return
|
||||||
|
|
||||||
|
lexicon = build_lexicon()
|
||||||
|
LEXICON_OUTPUT_PATH.write_text(
|
||||||
|
json.dumps(lexicon, ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
print("Lessico rigenerato")
|
||||||
|
print(f"- file: {LEXICON_OUTPUT_PATH}")
|
||||||
|
print(f"- voci: {lexicon['meta']['entry_count']}")
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_semantic_lexicon(args: argparse.Namespace) -> None:
|
||||||
|
needs_build = args.build_semantic_lexicon or not SEMANTIC_LEXICON_OUTPUT_PATH.exists()
|
||||||
|
if not needs_build:
|
||||||
|
return
|
||||||
|
|
||||||
|
lexicon = build_semantic_lexicon()
|
||||||
|
SEMANTIC_LEXICON_OUTPUT_PATH.write_text(
|
||||||
|
json.dumps(lexicon, ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
matched = sum(1 for entry in lexicon["entries"] if entry.get("semantic", {}).get("matched"))
|
||||||
|
print("Lessico semantico rigenerato")
|
||||||
|
print(f"- file: {SEMANTIC_LEXICON_OUTPUT_PATH}")
|
||||||
|
print(f"- voci: {lexicon['meta']['entry_count']}")
|
||||||
|
print(f"- match semantici: {matched}")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_difficulty(value: str) -> int:
|
||||||
|
text = str(value).strip().lower()
|
||||||
|
if text in DIFFICULTY_ALIASES:
|
||||||
|
return DIFFICULTY_ALIASES[text]
|
||||||
|
try:
|
||||||
|
level = int(text)
|
||||||
|
except ValueError as exc:
|
||||||
|
raise SystemExit(
|
||||||
|
"Valore non valido per --difficulty. Usa easy, medium, hard, expert oppure un intero tra 1 e 5."
|
||||||
|
) from exc
|
||||||
|
if not 1 <= level <= 5:
|
||||||
|
raise SystemExit("Il valore numerico di --difficulty deve essere compreso tra 1 e 5.")
|
||||||
|
return level
|
||||||
|
|
||||||
|
|
||||||
def load_selected_vocabulary(path: Path | None) -> List[str]:
|
def load_selected_vocabulary(path: Path | None) -> List[str]:
|
||||||
if path is None:
|
if path is None:
|
||||||
return load_vocabulary()
|
return load_vocabulary()
|
||||||
return path.read_text(encoding="utf-8").splitlines()
|
return path.read_text(encoding="utf-8").splitlines()
|
||||||
|
|
||||||
|
|
||||||
|
def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
|
||||||
|
if not LEXICON_OUTPUT_PATH.exists():
|
||||||
|
lexicon = build_lexicon()
|
||||||
|
LEXICON_OUTPUT_PATH.write_text(
|
||||||
|
json.dumps(lexicon, ensure_ascii=False, indent=2),
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
|
||||||
|
payload = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
|
||||||
|
normalized_topic = topic.strip().lower()
|
||||||
|
|
||||||
|
def matches(entry: Dict[str, object], selected_topic: str) -> bool:
|
||||||
|
topics = [str(item).lower() for item in entry.get("topics", [])]
|
||||||
|
return selected_topic in topics
|
||||||
|
|
||||||
|
words = [
|
||||||
|
entry["form"]
|
||||||
|
for entry in payload.get("entries", [])
|
||||||
|
if entry.get("allowed_in_crossword", False)
|
||||||
|
and int(entry.get("difficulty_word", 5)) <= level
|
||||||
|
and matches(entry, normalized_topic)
|
||||||
|
]
|
||||||
|
|
||||||
|
if words:
|
||||||
|
return words
|
||||||
|
|
||||||
|
if normalized_topic != DEFAULT_TOPIC:
|
||||||
|
return [
|
||||||
|
entry["form"]
|
||||||
|
for entry in payload.get("entries", [])
|
||||||
|
if entry.get("allowed_in_crossword", False)
|
||||||
|
and int(entry.get("difficulty_word", 5)) <= level
|
||||||
|
and matches(entry, DEFAULT_TOPIC)
|
||||||
|
]
|
||||||
|
|
||||||
|
return words
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
ensure_vocabulary(args)
|
ensure_vocabulary(args)
|
||||||
|
ensure_lexicon(args)
|
||||||
|
ensure_semantic_lexicon(args)
|
||||||
|
difficulty_level = parse_difficulty(args.difficulty)
|
||||||
|
|
||||||
generator = CrosswordGenerator(
|
generator = CrosswordGenerator(
|
||||||
WORDS,
|
WORDS,
|
||||||
diffxy=args.diffxy,
|
diffxy=args.diffxy,
|
||||||
time_limit_seconds=args.time_limit,
|
time_limit_seconds=args.time_limit,
|
||||||
max_candidates_per_word=args.max_candidates,
|
max_candidates_per_word=args.max_candidates,
|
||||||
|
seed=args.seed,
|
||||||
)
|
)
|
||||||
initial_state = generator.solve()
|
initial_state = generator.solve()
|
||||||
|
|
||||||
@@ -95,19 +223,24 @@ def main() -> None:
|
|||||||
print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}")
|
print(f"Parole inserite: {initial_state.placed_words}/{len(generator.words)}")
|
||||||
print(f"Intersezioni: {initial_state.intersections}")
|
print(f"Intersezioni: {initial_state.intersections}")
|
||||||
print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})")
|
print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})")
|
||||||
|
print(f"Difficolta filler: {args.difficulty} -> livello {difficulty_level}")
|
||||||
|
print(f"Tema filler: {args.topic}")
|
||||||
|
if args.seed is not None:
|
||||||
|
print(f"Seed: {args.seed}")
|
||||||
print()
|
print()
|
||||||
print(render_grid(initial_state.grid, initial_state.placements))
|
print(render_grid(initial_state.grid, initial_state.placements))
|
||||||
|
|
||||||
if args.skip_fill:
|
if args.skip_fill:
|
||||||
return
|
return
|
||||||
|
|
||||||
vocabulary = load_selected_vocabulary(args.vocabulary)
|
vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic)
|
||||||
metadata = load_vocabulary_metadata()
|
metadata = load_vocabulary_metadata()
|
||||||
filler = CrosswordFiller(
|
filler = CrosswordFiller(
|
||||||
initial_state,
|
initial_state,
|
||||||
vocabulary,
|
vocabulary,
|
||||||
target_empty_ratio=args.target_empty_ratio,
|
target_empty_ratio=args.target_empty_ratio,
|
||||||
vocabulary_metadata=metadata,
|
vocabulary_metadata=metadata,
|
||||||
|
seed=args.seed,
|
||||||
)
|
)
|
||||||
final_state = filler.fill()
|
final_state = filler.fill()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user