alpha01 backoffice: crossword engine, lexicon curation and JSON contract
This commit is contained in:
324
build_enriched_lexicon.py
Normal file
324
build_enriched_lexicon.py
Normal file
@@ -0,0 +1,324 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
from build_babelnet_enrichment import BABELNET_OUTPUT_PATH
|
||||
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default destination of the merged lexicon: a JSON file located in the same
# directory as this script.
ENRICHED_LEXICON_OUTPUT_PATH = Path(__file__).parent / "lexicon_it_enriched.json"
|
||||
|
||||
# Domain labels associated with each supported topic, split into three tiers.
# score_synset_for_topic() upper-cases a synset's "domains" and compares them
# with these tuples: every "strong" hit adds 60 points, every "weak" hit adds
# 25 points and every "negative" hit subtracts 35 points.
TOPIC_DOMAIN_RULES: Dict[str, Dict[str, Tuple[str, ...]]] = {
    "transport": {
        "strong": ("TRANSPORT_AND_TRAVEL", "NAVIGATION_AND_AVIATION"),
        "weak": ("CRAFT_ENGINEERING_AND_TECHNOLOGY", "FARMING_FISHING_AND_HUNTING"),
        "negative": (
            "MEDIA_AND_PRESS",
            "PHILOSOPHY_PSYCHOLOGY_AND_BEHAVIOR",
            "RELIGION_MYSTICISM_AND_MYTHOLOGY",
            "CHEMISTRY_AND_MINERALOGY",
        ),
    },
    "health": {
        "strong": ("HEALTH_AND_MEDICINE",),
        "weak": ("BIOLOGY",),
        "negative": ("MEDIA_AND_PRESS",),
    },
    "cinema": {
        "strong": ("MEDIA_AND_PRESS",),
        "weak": ("ART_ARCHITECTURE_AND_ARCHAEOLOGY",),
        "negative": ("HEALTH_AND_MEDICINE", "CHEMISTRY_AND_MINERALOGY"),
    },
    "nature": {
        "strong": ("BIOLOGY", "ANIMALS", "PLANTS", "EARTH", "METEOROLOGY"),
        "weak": ("GEOGRAPHY_AND_PLACES",),
        "negative": ("MEDIA_AND_PRESS",),
    },
    "ecology": {
        "strong": ("BIOLOGY", "EARTH", "METEOROLOGY"),
        "weak": ("GEOGRAPHY_AND_PLACES",),
        "negative": ("MEDIA_AND_PRESS",),
    },
}
|
||||
|
||||
# Lower-case Italian keywords (some are word stems, e.g. "trasport", "medic")
# searched as plain substrings inside a synset's combined text by
# score_synset_for_topic(); each keyword found adds 12 points for the topic.
TOPIC_TEXT_KEYWORDS: Dict[str, Tuple[str, ...]] = {
    "transport": (
        "aereo", "auto", "autobus", "barca", "bicicletta", "imbarcazione", "motore",
        "nave", "pista", "trasport", "treno", "veicolo", "viaggio",
    ),
    "health": (
        "cura", "malato", "medic", "ospedale", "paziente", "salute", "soccorso",
    ),
    "cinema": (
        "attore", "cinema", "film", "pellicola", "regia", "spettacolo",
    ),
    "nature": (
        "acqua", "animale", "bosco", "fiore", "mare", "montagna", "pianta", "terra",
    ),
    "ecology": (
        "ambiente", "ecologia", "inquinamento", "natura", "sostenibile",
    ),
}
|
||||
|
||||
|
||||
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the enrichment script.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, which is what the parameterless call did
            before; passing a list allows programmatic use.

    Returns:
        Namespace with ``semantic``, ``babelnet`` and ``output`` (``Path``
        values) and ``topic`` (a string, or ``None`` when not given).
    """
    parser = argparse.ArgumentParser(
        description="Fonde lexicon_it_semantic.json con gli arricchimenti BabelNet gia disponibili."
    )
    parser.add_argument(
        "--semantic",
        type=Path,
        default=SEMANTIC_LEXICON_OUTPUT_PATH,
        help="Lessico semantico completo di partenza.",
    )
    parser.add_argument(
        "--babelnet",
        type=Path,
        default=BABELNET_OUTPUT_PATH,
        help="File con arricchimenti BabelNet parziali.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=ENRICHED_LEXICON_OUTPUT_PATH,
        help="Lessico arricchito da generare.",
    )
    parser.add_argument(
        "--topic",
        default=None,
        help="Topic opzionale da usare per scegliere il synset BabelNet migliore.",
    )
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Return the parsed JSON content of *path*, or *default* when the file is absent.

    The file is decoded as UTF-8. Malformed JSON is not handled here: the
    ``json`` decoding error propagates to the caller.
    """
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Serialize *payload* as JSON and write it to *path* as UTF-8.

    The output is indented by two spaces and non-ASCII characters are written
    verbatim rather than as ``\\uXXXX`` escapes.
    """
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Return the ``(form, pos)`` lookup key of a lexicon entry.

    The form comes from ``normalized_form``, falling back to ``form`` when the
    former is missing or falsy; it is stripped and lower-cased. The part of
    speech comes from ``pos`` and is stripped and upper-cased. Missing values
    become empty strings.
    """
    raw_form = entry.get("normalized_form") or entry.get("form") or ""
    raw_pos = entry.get("pos") or ""
    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
|
||||
|
||||
|
||||
def dedupe(items: Iterable[str]) -> List[str]:
    """Return the distinct, non-empty items of *items* in first-seen order.

    Each item is converted with ``str()`` and stripped before comparison, so
    values that differ only by surrounding whitespace collapse into one.
    """
    # A dict keeps insertion order, so its keys act as an ordered set.
    unique: Dict[str, None] = {}
    for raw in items:
        cleaned = str(raw).strip()
        if cleaned:
            unique.setdefault(cleaned, None)
    return list(unique)
|
||||
|
||||
|
||||
def topic_candidates(entry: Dict[str, object], requested_topic: Optional[str]) -> List[str]:
    """List the topics against which the synsets of *entry* should be scored.

    Args:
        entry: Lexicon entry; its ``topics`` value, when present, is read as an
            iterable of topic names.
        requested_topic: Topic explicitly asked for by the caller, or ``None``.

    Returns:
        Lower-cased, de-duplicated topic names with the requested topic first
        (when given). The catch-all ``"general"`` topic is always dropped.
    """
    # ``or []`` also covers a key that is present with a null value, where
    # ``.get("topics", [])`` would hand back None and break the iteration.
    own_topics = [str(topic).lower() for topic in entry.get("topics") or [] if topic]
    if requested_topic:
        own_topics.insert(0, requested_topic.lower())
    return [topic for topic in dedupe(own_topics) if topic != "general"]
|
||||
|
||||
|
||||
def synset_text(synset: Dict[str, object]) -> str:
    """Return the searchable text of a synset as one lower-case string.

    The values found under ``glosses``, ``categories`` and ``senses`` (in that
    order) are converted with ``str()`` and joined with single spaces. Missing
    or null fields contribute nothing.
    """
    parts: List[str] = []
    for field in ("glosses", "categories", "senses"):
        parts += [str(value) for value in synset.get(field) or ()]
    return " ".join(parts).lower()
|
||||
|
||||
|
||||
def score_synset_for_topic(synset: Dict[str, object], topic: str) -> int:
    """Score how well *synset* fits *topic*; higher means a better fit.

    The score sums two contributions:

    * domains: the synset's upper-cased ``domains`` are intersected with the
      ``TOPIC_DOMAIN_RULES`` tiers of the topic (+60 per strong domain,
      +25 per weak domain, -35 per negative domain);
    * keywords: +12 for every ``TOPIC_TEXT_KEYWORDS`` entry of the topic that
      occurs as a substring of ``synset_text(synset)``.

    A topic without rules or keywords yields 0.
    """
    tier_weights = (("strong", 60), ("weak", 25), ("negative", -35))
    rules = TOPIC_DOMAIN_RULES.get(topic, {})
    domains = {str(domain).upper() for domain in synset.get("domains") or ()}
    domain_score = sum(
        weight * len(domains.intersection(rules.get(tier, ())))
        for tier, weight in tier_weights
    )

    text = synset_text(synset)
    keyword_hits = sum(1 for keyword in TOPIC_TEXT_KEYWORDS.get(topic, ()) if keyword in text)
    return domain_score + 12 * keyword_hits
|
||||
|
||||
|
||||
def _summarize_synset(
    synset: Dict[str, object], topic: Optional[str], topic_score: int
) -> Dict[str, object]:
    """Build the compact record describing the synset picked by ``choose_best_synset``."""
    return {
        "id": synset.get("id"),
        "topic": topic,
        "topic_score": topic_score,
        # Scores of 40 or more are flagged as a strong topic match.
        "strong_topic": topic_score >= 40,
        "senses": synset.get("senses", []),
        "glosses": synset.get("glosses", []),
        "categories": synset.get("categories", []),
        "domains": synset.get("domains", []),
    }


def choose_best_synset(
    babelnet: Dict[str, object], entry: Dict[str, object], requested_topic: Optional[str]
) -> Tuple[Optional[Dict[str, object]], Dict[str, int]]:
    """Pick the synset that best fits the topics of *entry*.

    Args:
        babelnet: Raw BabelNet payload; only the dict items of its ``synsets``
            list are considered.
        entry: Lexicon entry whose topics drive the selection.
        requested_topic: Optional topic that takes precedence over the entry's
            own topics.

    Returns:
        A ``(best, topic_scores)`` pair. ``best`` is ``None`` when the payload
        has no usable synset; otherwise it is a summary dict (``id``,
        ``topic``, ``topic_score``, ``strong_topic``, ``senses``, ``glosses``,
        ``categories``, ``domains``). ``topic_scores`` maps each candidate
        topic to the highest score any synset reached for it, and is empty
        when there are no synsets or no candidate topics.
    """
    synsets = [item for item in babelnet.get("synsets", []) or [] if isinstance(item, dict)]
    if not synsets:
        return None, {}

    topics = topic_candidates(entry, requested_topic)
    if not topics:
        # Nothing to rank against: fall back to the first synset, unscored.
        return _summarize_synset(synsets[0], None, 0), {}

    topic_scores: Dict[str, int] = {}
    best_synset: Optional[Dict[str, object]] = None
    best_topic: Optional[str] = None
    best_score = 0
    for topic in topics:
        # Score every synset once per topic and reuse the values for both the
        # per-topic maximum and the overall winner.
        scores = [score_synset_for_topic(synset, topic) for synset in synsets]
        topic_scores[topic] = max(scores)
        for synset, score in zip(synsets, scores):
            # Strict ">" keeps the earliest topic/synset on ties. The identity
            # check (rather than truthiness) lets an empty-dict synset win,
            # matching what the no-topic fallback above already accepts.
            if best_synset is None or score > best_score:
                best_synset, best_topic, best_score = synset, topic, score

    # Both lists are non-empty here, so the loop has selected a synset.
    return _summarize_synset(best_synset, best_topic, best_score), topic_scores
|
||||
|
||||
|
||||
def normalize_babelnet_status(
    entry: Dict[str, object], babelnet_entry: Optional[Dict[str, object]], requested_topic: Optional[str]
) -> Dict[str, object]:
    """Turn the raw BabelNet record of *entry* into a normalized status block.

    Args:
        entry: Lexicon entry being enriched (provides the candidate topics).
        babelnet_entry: Matching record of the BabelNet enrichment file, or
            ``None``/empty when the entry has no record there.
        requested_topic: Optional topic forwarded to ``choose_best_synset``.

    Returns:
        A dict whose ``status`` is one of:

        * ``"not_requested"``: no BabelNet record for the entry;
        * ``"api_error"``: the record's ``babelnet`` value is not a dict;
        * ``"no_match"``: the record is not flagged as matched, or it is
          flagged as matched but no synset could be selected from it;
        * ``"ambiguous"``: a synset was selected with a topic score <= 0;
        * ``"enriched"``: a synset was selected with a positive topic score.
    """
    if not babelnet_entry:
        return {"status": "not_requested"}

    raw_babelnet = babelnet_entry.get("babelnet", {})
    if not isinstance(raw_babelnet, dict):
        return {"status": "api_error", "reason": "invalid_babelnet_payload"}

    if not raw_babelnet.get("matched"):
        return {
            "status": "no_match",
            "matched": False,
            "reason": raw_babelnet.get("reason", "no_synsets"),
            "synsets": [],
        }

    best_synset, topic_scores = choose_best_synset(raw_babelnet, entry, requested_topic)
    if best_synset is None:
        # Flagged as matched but nothing selectable: reporting "enriched" with
        # a null best_synset would mislead consumers, so use the same shape as
        # the unmatched case.
        return {
            "status": "no_match",
            "matched": False,
            "reason": "no_synsets",
            "synsets": [],
        }

    topic_score = int(best_synset.get("topic_score", 0))
    return {
        "status": "enriched" if topic_score > 0 else "ambiguous",
        "matched": True,
        "selected_synset_id": best_synset.get("id"),
        "selected_topic": best_synset.get("topic"),
        "topic_score": topic_score,
        "strong_topic": bool(best_synset.get("strong_topic", False)),
        "synset_refs": raw_babelnet.get("synset_refs", []),
        "synsets": raw_babelnet.get("synsets", []),
        "topic_scores": topic_scores,
        "best_synset": best_synset,
        "source_generated_at": babelnet_entry.get("babelnet_generated_at"),
    }
|
||||
|
||||
|
||||
def build_babelnet_index(payload: Dict[str, object]) -> Dict[Tuple[str, str], Dict[str, object]]:
    """Index the entries of a BabelNet enrichment payload by ``entry_key``.

    Items of ``payload["entries"]`` that are not dicts are skipped. When
    several entries share a key, the last one wins.
    """
    candidates = payload.get("entries", []) or []
    return {entry_key(item): item for item in candidates if isinstance(item, dict)}
|
||||
|
||||
|
||||
def build_enriched_lexicon(args: argparse.Namespace) -> Dict[str, object]:
    """Merge the semantic lexicon with the available BabelNet enrichments.

    Args:
        args: Parsed options; ``semantic`` and ``babelnet`` are read as paths
            and ``topic`` as the optional requested topic.

    Returns:
        A payload with a ``meta`` section (language, version, source file
        names, generation time, requested topic, entry count and per-status
        counts) and the list of enriched ``entries``. Every entry is a deep
        copy of the semantic entry with an added ``babelnet`` status block.

    Raises:
        ValueError: If the semantic lexicon is missing, is not a JSON object
            or has no ``entries`` key.
    """
    semantic_payload = load_json(args.semantic, {})
    if not isinstance(semantic_payload, dict) or "entries" not in semantic_payload:
        raise ValueError(f"Lessico semantico non valido: {args.semantic}")

    # A missing or malformed BabelNet file is not fatal: every entry simply
    # ends up without a BabelNet record.
    babelnet_payload = load_json(args.babelnet, {"entries": []})
    if not isinstance(babelnet_payload, dict):
        babelnet_payload = {"entries": []}
    babelnet_index = build_babelnet_index(babelnet_payload)

    enriched_entries = []
    status_counts: Dict[str, int] = {}
    for entry in semantic_payload.get("entries", []) or []:
        if not isinstance(entry, dict):
            continue
        enriched = deepcopy(entry)
        babelnet_entry = babelnet_index.get(entry_key(enriched))
        enriched["babelnet"] = normalize_babelnet_status(enriched, babelnet_entry, args.topic)
        status = str(enriched["babelnet"].get("status", "unknown"))
        status_counts[status] = status_counts.get(status, 0) + 1
        enriched_entries.append(enriched)

    # "meta" may be absent, null or not an object in the source file; chaining
    # ``.get`` on it directly would raise AttributeError in the last two cases.
    source_meta = semantic_payload.get("meta")
    language = source_meta.get("language", "it") if isinstance(source_meta, dict) else "it"

    return {
        "meta": {
            "language": language,
            "version": 1,
            "base_lexicon": args.semantic.name,
            "babelnet_source": args.babelnet.name if args.babelnet.exists() else None,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "requested_topic": args.topic,
            "entry_count": len(enriched_entries),
            "babelnet_status_counts": status_counts,
        },
        "entries": enriched_entries,
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Run the CLI: build the enriched lexicon, save it and print a summary."""
    args = parse_args()
    payload = build_enriched_lexicon(args)
    write_json(args.output, payload)

    meta = payload["meta"]
    print(f"Lessico arricchito generato: {args.output}")
    print(f"Voci totali: {meta['entry_count']}")
    print(f"Stati BabelNet: {meta['babelnet_status_counts']}")
|
||||
|
||||
|
||||
# Run the command-line entry point only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user