alpha01 backoffice: crossword engine, lexicon curation and JSON contract
This commit is contained in:
473
refine_lexicon_topics.py
Normal file
473
refine_lexicon_topics.py
Normal file
@@ -0,0 +1,473 @@
|
||||
from __future__ import annotations

import argparse
import json
import re
import unicodedata
from copy import deepcopy
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterable, List, Tuple

from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default destination of the refined lexicon: a file that sits next to this script.
REFINED_LEXICON_OUTPUT_PATH = Path(__file__).parent / "lexicon_it_refined.json"
|
||||
|
||||
# Topic name -> lowercase keywords (or keyword stems such as "ecclesiast")
# whose presence in an entry's flattened text contributes to that topic's score.
TOPIC_KEYWORDS: Dict[str, Tuple[str, ...]] = dict(
    religion=(
        "abbazia", "abate", "arcivescovo", "cappella", "cardinale", "chiesa",
        "clero", "convento", "diocesi", "ecclesiast", "fede", "frate",
        "mistica", "monaco", "monastero", "parrocchia", "prete", "religion",
        "sacerdot", "santo", "vescovo",
    ),
    ecclesiastical_hierarchy=(
        "abate", "arcivescovo", "carica ecclesiastica", "cardinale", "clero",
        "dignità ecclesiastica", "ecclesiast", "ordinazione", "parroco",
        "patriarca", "pontefice", "prete", "priore", "superiore del monastero",
        "vescovo",
    ),
    honorific_title=(
        "carica", "epiteto", "nobile", "onore", "onorific", "titolo",
    ),
    mysticism=(
        "asceta", "contemplazione", "estasi", "mistica", "mistico",
        "monachesimo", "spiritual",
    ),
    geography=(
        "borgo", "città", "comune", "frazione", "geografia", "isola",
        "località", "paese", "provincia", "regione", "stato", "toponimo",
        "valle",
    ),
    transport=(
        "aereo", "aeroplano", "auto", "autobus", "autocarro", "barca",
        "bicicletta", "imbarcazione", "locomotiva", "motore", "nave", "pista",
        "porto", "stazione", "traghetto", "treno", "trasport", "veicolo",
        "viaggio",
    ),
    nature=(
        "acqua", "albero", "animale", "bosco", "fiore", "fiume", "foresta",
        "mare", "montagna", "natura", "pianta", "terra",
    ),
    health=(
        "ambulanza", "anemia", "cura", "farmaco", "malattia", "medic",
        "ospedale", "paziente", "salute", "soccorso", "terapia",
    ),
    war=(
        "arma", "artiglieria", "assalto", "battaglia", "bombard", "esercito",
        "fortezza", "guerra", "militare", "soldato", "trincea",
    ),
)
|
||||
|
||||
# Function words (Italian plus a couple of English ones) that are never kept
# as semantic tags once a tag has been slugified.
TAG_STOPWORDS = set(
    (
        "and con da dei del della delle dello di e il in "
        "la le lo nel nella per su the un una"
    ).split()
)
|
||||
|
||||
|
||||
def parse_args(argv: Iterable[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the lexicon refinement script.

    Args:
        argv: Argument strings to parse. When ``None`` (the default) argparse
            falls back to ``sys.argv[1:]``, which is what the script entry
            point relies on; passing an explicit list makes the function
            callable from other code without touching ``sys.argv``.

    Returns:
        A namespace with ``input`` and ``output`` (``Path``),
        ``replace_general`` (``bool``) and ``min_topic_score`` (``int``).
    """
    parser = argparse.ArgumentParser(
        description="Genera un lessico raffinato con campi aggiuntivi per topic, tag semantici e sensi."
    )
    parser.add_argument(
        "--input",
        type=Path,
        default=ENRICHED_LEXICON_OUTPUT_PATH,
        help="File lessicale di partenza, tipicamente lexicon_it_enriched.json.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=REFINED_LEXICON_OUTPUT_PATH,
        help="Nuovo file lessicale raffinato da generare.",
    )
    parser.add_argument(
        "--replace-general",
        action="store_true",
        help="Se attivo, sostituisce topic=['general'] con i topic suggeriti quando la confidenza e alta.",
    )
    parser.add_argument(
        "--min-topic-score",
        type=int,
        default=40,
        help="Punteggio minimo per promuovere un topic suggerito nei topics finali.",
    )
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def load_json(path: Path) -> Dict[str, object]:
    """Read ``path`` as UTF-8 text and return the decoded JSON document."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Serialize ``payload`` as 2-space indented JSON and write it to ``path``.

    Non-ASCII characters are written verbatim (UTF-8), not as ``\\u`` escapes.
    """
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
|
||||
|
||||
|
||||
def dedupe(items: Iterable[str]) -> List[str]:
    """Return the distinct non-blank items, keeping first-seen order.

    Each item is converted with ``str`` and stripped; blank results are
    dropped. Two items are duplicates when they are equal after lowercasing,
    and the first spelling encountered is the one that is kept.
    """
    first_spelling: Dict[str, str] = {}
    for raw in items:
        candidate = str(raw).strip()
        if candidate:
            # setdefault keeps the earliest spelling for a given lowercase key.
            first_spelling.setdefault(candidate.lower(), candidate)
    return list(first_spelling.values())
|
||||
|
||||
|
||||
def slugify_tag(text: str) -> str:
    """Turn free text into a lowercase ``[a-z0-9_]`` slug.

    Accented letters are folded to their base letter before slugification
    ("città" -> "citta"). Without this step the ASCII-only pattern below
    treats every accented character as a separator and silently truncates
    Italian words ("città" -> "citt").

    Args:
        text: Raw tag text.

    Returns:
        The slug, with runs of non-alphanumeric characters collapsed to a
        single ``_`` and no leading/trailing ``_``. May be the empty string.
    """
    # NFKD splits "à" into "a" + a combining accent; dropping the combining
    # marks leaves the plain base letters.
    decomposed = unicodedata.normalize("NFKD", text.strip()).lower()
    folded = "".join(char for char in decomposed if not unicodedata.combining(char))
    return re.sub(r"[^a-z0-9]+", "_", folded).strip("_")
|
||||
|
||||
|
||||
def cleanup_tag(tag: str) -> str:
    """Slugify ``tag`` and discard it when it carries no information.

    Returns the slug, or ``""`` when the slug is empty, a single character,
    or one of ``TAG_STOPWORDS``.
    """
    slug = slugify_tag(tag)
    is_meaningful = len(slug) > 1 and slug not in TAG_STOPWORDS
    return slug if is_meaningful else ""
|
||||
|
||||
|
||||
def _iter_strings(container: Dict[str, object], *keys: str) -> Iterable[str]:
    """Yield ``str(value)`` for every non-null item of the lists stored under ``keys``."""
    for key in keys:
        for value in container.get(key) or []:
            # A JSON null inside a list would otherwise become the text "None".
            if value is not None:
                yield str(value)


def flatten_text(entry: Dict[str, object]) -> str:
    """Concatenate every textual field of a lexicon entry into one lowercase string.

    The fields are read in this order: the entry's ``topics``; from
    ``semantic`` its ``semantic_topics``, ``glosses`` and, per synset, the
    ``definition`` and ``lemmas``; from ``babelnet`` its ``synset_refs`` and,
    for ``best_synset`` followed by every item of ``synsets``, the
    ``glosses``, ``categories``, ``domains`` and ``senses``.

    Sections that are missing or not dictionaries are skipped. Null values
    and empty definitions are ignored rather than being rendered as the
    literal text "None" or as an empty chunk.

    Args:
        entry: One lexicon entry as decoded from JSON.

    Returns:
        The space-joined, lowercased text; ``""`` when nothing is found.
    """
    chunks: List[str] = list(_iter_strings(entry, "topics"))

    semantic = entry.get("semantic")
    if isinstance(semantic, dict):
        chunks.extend(_iter_strings(semantic, "semantic_topics", "glosses"))
        for synset in semantic.get("synsets") or []:
            if not isinstance(synset, dict):
                continue
            definition = synset.get("definition")
            if definition is not None and definition != "":
                chunks.append(str(definition))
            chunks.extend(_iter_strings(synset, "lemmas"))

    babelnet = entry.get("babelnet")
    if isinstance(babelnet, dict):
        chunks.extend(_iter_strings(babelnet, "synset_refs"))
        # The best synset and the full synset list expose the same four fields.
        candidates = [babelnet.get("best_synset"), *(babelnet.get("synsets") or [])]
        for synset in candidates:
            if isinstance(synset, dict):
                chunks.extend(_iter_strings(synset, "glosses", "categories", "domains", "senses"))

    return " ".join(chunks).lower()
|
||||
|
||||
|
||||
def infer_topic_scores(entry: Dict[str, object]) -> Dict[str, int]:
    """Score each topic of ``TOPIC_KEYWORDS`` against the entry's flattened text.

    Every keyword hit is worth 12 points and a topic's score is capped at 100.
    Topics without any hit are omitted from the result.

    A keyword only counts when it starts a word: the character before it must
    not be a letter or a digit (``_`` counts as a separator, so
    underscore-joined names such as "comuni_della_provincia" still match
    "provincia"). Keywords therefore still work as stems ("medic" matches
    "medicina") but no longer fire in the middle of unrelated words. A plain
    substring count makes "farmaco" score for ``war`` via "arma" and
    "interprete" score for ``religion`` via "prete".

    Args:
        entry: One lexicon entry as decoded from JSON.

    Returns:
        Mapping of topic name to an integer score in ``12..100``.
    """
    text = flatten_text(entry)
    scores: Dict[str, int] = {}
    for topic, keywords in TOPIC_KEYWORDS.items():
        hits = sum(
            # (?<![^\W_]) = "not preceded by a letter or digit".
            len(re.findall(r"(?<![^\W_])" + re.escape(keyword.lower()), text))
            for keyword in keywords
        )
        if hits:
            scores[topic] = min(12 * hits, 100)
    return scores
|
||||
|
||||
|
||||
def collect_semantic_tags(entry: Dict[str, object]) -> List[str]:
    """Gather the cleaned, de-duplicated semantic tags of a lexicon entry.

    Candidate tags come, in order, from the entry's ``topics``; from
    ``semantic`` its ``semantic_topics`` and every term list in
    ``raw_relation_terms``; and from ``babelnet`` the ``categories`` and
    ``domains`` of ``best_synset`` followed by those of each item in
    ``synsets``. Each candidate goes through ``cleanup_tag``; rejected
    candidates and duplicates are dropped.
    """

    def raw_tags() -> Iterable[object]:
        yield from entry.get("topics") or []

        semantic = entry.get("semantic", {})
        if isinstance(semantic, dict):
            yield from semantic.get("semantic_topics") or []
            relation_terms = semantic.get("raw_relation_terms") or {}
            for terms in relation_terms.values():
                yield from terms or []

        babelnet = entry.get("babelnet", {})
        if isinstance(babelnet, dict):
            synsets = [babelnet.get("best_synset", {}), *(babelnet.get("synsets") or [])]
            for synset in synsets:
                if isinstance(synset, dict):
                    yield from synset.get("categories") or []
                    yield from synset.get("domains") or []

    slugs = [cleanup_tag(str(candidate)) for candidate in raw_tags()]
    return [slug for slug in dedupe(slugs) if slug]
|
||||
|
||||
|
||||
def collect_senses(entry: Dict[str, object], topic_scores: Dict[str, int]) -> List[Dict[str, object]]:
    """Build the list of word senses for an entry.

    One sense is produced for every ``semantic`` synset that has a non-empty
    definition (fixed confidence 0.7), followed by at most one sense for the
    BabelNet ``best_synset`` when it has an ``id`` and at least one non-empty
    gloss. Every sense also receives the topics whose inferred score is at
    least 50.

    Null values are treated as missing rather than being rendered as the
    text "None", and a missing or non-numeric ``topic_score`` counts as 0
    instead of raising.

    Args:
        entry: One lexicon entry as decoded from JSON.
        topic_scores: Topic name -> score, as returned by ``infer_topic_scores``.

    Returns:
        A list of dictionaries with the keys ``source``, ``id``,
        ``definition``, ``lemmas``, ``topics`` and ``confidence``.
    """
    # Same for every sense of the entry, so computed once.
    strong_topics = [topic for topic, score in topic_scores.items() if score >= 50]
    senses: List[Dict[str, object]] = []

    semantic = entry.get("semantic")
    if isinstance(semantic, dict):
        semantic_topics = list(semantic.get("semantic_topics") or [])
        for synset in semantic.get("synsets") or []:
            if not isinstance(synset, dict):
                continue
            definition = str(synset.get("definition") or "").strip()
            if not definition:
                continue
            senses.append(
                {
                    "source": "semantic",
                    "id": synset.get("id"),
                    "definition": definition,
                    "lemmas": dedupe(str(item) for item in synset.get("lemmas") or []),
                    "topics": dedupe(semantic_topics + strong_topics),
                    "confidence": 0.7,
                }
            )

    babelnet = entry.get("babelnet")
    if isinstance(babelnet, dict):
        best_synset = babelnet.get("best_synset")
        if isinstance(best_synset, dict) and best_synset.get("id"):
            glosses = [
                str(item).strip()
                for item in best_synset.get("glosses") or []
                if item is not None and str(item).strip()
            ]
            if glosses:
                try:
                    topic_score = float(best_synset.get("topic_score") or 0)
                except (TypeError, ValueError):
                    topic_score = 0.0
                senses.append(
                    {
                        "source": "babelnet",
                        "id": best_synset.get("id"),
                        "definition": glosses[0],
                        "lemmas": dedupe(str(item) for item in best_synset.get("senses") or []),
                        "topics": dedupe([str(best_synset.get("topic") or "").strip()] + strong_topics),
                        # Percentage score mapped to 0..1 and clamped to [0.4, 0.95].
                        "confidence": round(min(max(topic_score / 100.0, 0.4), 0.95), 2),
                    }
                )

    return senses
|
||||
|
||||
|
||||
def collect_geo_tags(entry: Dict[str, object]) -> List[str]:
    """Return ``["toponym_possible"]`` when a BabelNet category looks geographic.

    Only the ``categories`` of the items in ``babelnet["synsets"]`` are
    inspected; a category matches when its lowercase text contains one of the
    place-related markers below. Returns an empty list otherwise.
    """
    markers = ("comuni_", "province_", "regioni_", "città", "paesi", "località")
    babelnet = entry.get("babelnet", {})
    synsets = (babelnet.get("synsets") or []) if isinstance(babelnet, dict) else []
    matches = [
        "toponym_possible"
        for synset in synsets
        if isinstance(synset, dict)
        for category in synset.get("categories") or []
        if any(marker in str(category).lower() for marker in markers)
    ]
    # Several categories may match; dedupe collapses them to a single tag.
    return dedupe(matches)
|
||||
|
||||
|
||||
def collect_name_tags(entry: Dict[str, object]) -> List[str]:
    """Return ``["capitalized_form"]`` when the entry's form starts with an uppercase letter.

    A missing or null ``form`` yields no tag. Converting a null form with
    ``str`` would give "None", which starts with an uppercase letter and
    would be tagged by mistake.

    Args:
        entry: One lexicon entry as decoded from JSON.

    Returns:
        ``["capitalized_form"]`` or an empty list.
    """
    form = str(entry.get("form") or "")
    return ["capitalized_form"] if form[:1].isupper() else []
|
||||
|
||||
|
||||
def should_review(entry: Dict[str, object], topic_scores: Dict[str, int], senses: List[Dict[str, object]]) -> bool:
    """Decide whether an entry should be flagged for manual review.

    An entry is flagged when any of the following holds:

    * its only topic is "general" (case-insensitive) and no inferred topic
      scores 50 or more;
    * its BabelNet ``status`` is "ambiguous" and the best inferred score is
      below 50;
    * it has at least three senses and at least two inferred topics scoring
      50 or more.

    A ``babelnet`` section that is missing, null or not a dictionary is
    treated as having no status instead of raising ``AttributeError``.

    Args:
        entry: The lexicon entry whose ``topics`` and ``babelnet`` are read.
        topic_scores: Topic name -> score for the entry.
        senses: Senses collected for the entry; only their count is used.

    Returns:
        ``True`` when the entry needs review, ``False`` otherwise.
    """
    existing_topics = [str(topic).lower() for topic in entry.get("topics") or []]
    best_score = max(topic_scores.values(), default=0)
    strong_topics = [topic for topic, score in topic_scores.items() if score >= 50]

    babelnet = entry.get("babelnet")
    babelnet_status = str(babelnet.get("status") or "") if isinstance(babelnet, dict) else ""

    if existing_topics == ["general"] and not strong_topics:
        return True
    if babelnet_status == "ambiguous" and best_score < 50:
        return True
    return len(senses) >= 3 and len(strong_topics) >= 2
|
||||
|
||||
|
||||
def promoted_topics(
    existing_topics: List[str], topic_scores: Dict[str, int], replace_general: bool, min_topic_score: int
) -> List[str]:
    """Merge an entry's existing topics with the inferred topics that score high enough.

    Inferred topics are those with a score of at least ``min_topic_score``,
    ordered by descending score and then alphabetically. They are appended
    to the existing topics, without duplicates.

    When ``replace_general`` is set and the only existing topic is "general",
    the inferred topics replace it, provided there is at least one. That
    check ignores letter case, matching the way ``should_review`` recognises
    a general-only entry.

    Args:
        existing_topics: Topics currently assigned to the entry.
        topic_scores: Topic name -> inferred score.
        replace_general: Whether a lone "general" topic may be replaced.
        min_topic_score: Minimum score for an inferred topic to be promoted.

    Returns:
        The final, de-duplicated topic list.
    """
    ranked = sorted(topic_scores.items(), key=lambda item: (-item[1], item[0]))
    inferred = [topic for topic, score in ranked if score >= min_topic_score]
    existing_clean = dedupe(existing_topics)

    only_general = [topic.lower() for topic in existing_clean] == ["general"]
    if replace_general and only_general and inferred:
        return inferred

    return dedupe(existing_clean + inferred)
|
||||
|
||||
|
||||
def refine_entry(entry: Dict[str, object], replace_general: bool, min_topic_score: int) -> Dict[str, object]:
    """Return an enriched deep copy of ``entry``; the input is left untouched.

    The copy gets an updated ``topics`` list plus the fields
    ``semantic_tags``, ``senses``, ``topic_confidence`` (topic -> score),
    ``topic_suggestions`` (topics ordered by descending score, then name),
    ``geo_tags``, ``name_tags`` and ``needs_review``.
    """
    refined = deepcopy(entry)
    topic_scores = infer_topic_scores(refined)
    senses = collect_senses(refined, topic_scores)
    ranked_scores = sorted(topic_scores.items(), key=lambda item: (-item[1], item[0]))
    original_topics = [str(topic) for topic in refined.get("topics", []) or []]

    # The whole mapping is evaluated before ``update`` runs, so every helper
    # below still sees the entry as it was loaded.
    refined.update(
        {
            "topics": promoted_topics(original_topics, topic_scores, replace_general, min_topic_score),
            "semantic_tags": collect_semantic_tags(refined),
            "senses": senses,
            "topic_confidence": topic_scores,
            "topic_suggestions": [topic for topic, _ in ranked_scores],
            "geo_tags": collect_geo_tags(refined),
            "name_tags": collect_name_tags(refined),
        }
    )

    # NOTE(review): should_review reads ``topics`` after promotion, so its
    # "general only" rule sees the promoted list, not the topics the entry
    # was loaded with. Confirm this is intended.
    refined["needs_review"] = should_review(refined, topic_scores, senses)
    return refined
|
||||
|
||||
|
||||
def build_refined_lexicon(args: argparse.Namespace) -> Dict[str, object]:
    """Load the input lexicon, refine every entry and assemble the output document.

    Reads ``args.input``, ``args.replace_general`` and
    ``args.min_topic_score``. Items of ``entries`` that are not dictionaries
    are skipped.

    Returns:
        A dictionary with ``meta`` (run parameters and counters) and
        ``entries`` (the refined entries).

    Raises:
        ValueError: If the loaded document is not a dictionary or has no
            ``entries`` key.
    """
    source = load_json(args.input)
    if not (isinstance(source, dict) and "entries" in source):
        raise ValueError(f"Lessico di input non valido: {args.input}")

    refined_entries = []
    for raw_entry in source.get("entries", []) or []:
        if isinstance(raw_entry, dict):
            refined_entries.append(refine_entry(raw_entry, args.replace_general, args.min_topic_score))

    flagged = [item for item in refined_entries if item.get("needs_review")]
    with_suggestions = [item for item in refined_entries if item.get("topic_suggestions", []) or []]

    meta = {
        "language": "it",
        "version": 1,
        "base_lexicon": args.input.name,
        # Local time with its UTC offset, to the second.
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(refined_entries),
        "replace_general": args.replace_general,
        "min_topic_score": args.min_topic_score,
        "review_count": len(flagged),
        "topicful_count": len(with_suggestions),
    }
    return {"meta": meta, "entries": refined_entries}
|
||||
|
||||
|
||||
def main() -> None:
    """Script entry point: parse options, build the refined lexicon, write it, print a summary."""
    args = parse_args()
    payload = build_refined_lexicon(args)
    write_json(args.output, payload)

    meta = payload["meta"]
    summary = (
        ("Lessico raffinato generato", args.output),
        ("Voci totali", meta["entry_count"]),
        ("Voci con suggerimenti di topic", meta["topicful_count"]),
        ("Voci marcate needs_review", meta["review_count"]),
    )
    for label, value in summary:
        print(f"{label}: {value}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user