alpha01 backoffice: crossword engine, lexicon curation and JSON contract
This commit is contained in:
14
.gitignore
vendored
Normal file
14
.gitignore
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.babelnet_cache.json
|
||||
.wiktionary_cache.json
|
||||
.wiktextract_it_index.json
|
||||
.babelnet_api_key.local
|
||||
logs/
|
||||
raw-wiktextract-data.jsonl
|
||||
lexicon_it*.json
|
||||
llm_rescue_patch.json
|
||||
treccani_rescue_patch.json
|
||||
to_be_review*.json
|
||||
_*.json
|
||||
idee.txt
|
||||
162
apply_llm_rescue_patch.py
Normal file
162
apply_llm_rescue_patch.py
Normal file
@@ -0,0 +1,162 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
|
||||
# Default input/output files; all of them are resolved in the directory that
# contains this script, so the tool works regardless of the current directory.
DEFAULT_LEXICON_PATH = Path(__file__).parent / "lexicon_it_curated.json"
DEFAULT_PATCH_PATH = Path(__file__).parent / "llm_rescue_patch.json"
DEFAULT_OUTPUT_PATH = Path(__file__).parent / "lexicon_it_curated_llm.json"
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the rescue-patch tool.

    Returns:
        The namespace produced by ``argparse`` with the attributes
        ``lexicon``, ``patch``, ``output`` (paths), ``min_confidence``
        (float) and ``include_needs_review`` (bool).
    """
    cli = argparse.ArgumentParser(
        description="Applica una patch LLM rescue al lessico curato per produrre un lessico operativo aggiornato."
    )

    # The three file options share the same shape, so they are declared as data.
    path_options = (
        ("--lexicon", DEFAULT_LEXICON_PATH, "Lessico curato di partenza."),
        ("--patch", DEFAULT_PATCH_PATH, "Patch LLM rescue da applicare."),
        ("--output", DEFAULT_OUTPUT_PATH, "Lessico aggiornato in uscita."),
    )
    for flag, fallback, description in path_options:
        cli.add_argument(flag, type=Path, default=fallback, help=description)

    cli.add_argument(
        "--min-confidence",
        type=float,
        default=0.6,
        help="Confidenza minima per applicare automaticamente una definizione rescue.",
    )
    cli.add_argument(
        "--include-needs-review",
        action="store_true",
        help="Applica anche voci marcate needs_human_review=true se superano la soglia di confidenza.",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Return the JSON document stored at *path*, or *default* if the file is missing.

    The file is read as UTF-8. A file that exists but holds invalid JSON still
    raises the decoder's error; only a missing file yields *default*.
    """
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Serialize *payload* as 2-space indented JSON and write it to *path* as UTF-8.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``).
    """
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def normalize_key(form: str, pos: str) -> Tuple[str, str]:
    """Build the lookup key used to pair lexicon entries with patch entries.

    Falsy inputs (``None``, ``""``) become empty strings; the form is trimmed
    and lower-cased, the part of speech is trimmed and upper-cased.
    """
    form_text = str(form) if form else ""
    pos_text = str(pos) if pos else ""
    return form_text.strip().lower(), pos_text.strip().upper()
|
||||
|
||||
|
||||
def merge_topics(existing: List[str], incoming: List[str]) -> List[str]:
    """Concatenate two label lists, dropping blanks and case-insensitive duplicates.

    Items are stringified and trimmed. The first spelling seen for a given
    lower-cased label wins, and first-seen order is preserved. ``None`` (or
    any falsy value) is accepted for either argument.
    """
    # Insertion-ordered dict: lower-cased label -> first spelling encountered.
    first_spelling: Dict[str, str] = {}
    for source in (existing, incoming):
        for item in source or []:
            text = str(item).strip()
            if text:
                first_spelling.setdefault(text.lower(), text)
    return list(first_spelling.values())
|
||||
|
||||
|
||||
def apply_patch(args: argparse.Namespace) -> Dict[str, Any]:
    """Apply the LLM rescue patch to the curated lexicon and write the result.

    Lexicon entries are paired with patch entries through
    ``normalize_key(form, pos)``. A paired patch is applied only when it has a
    non-empty ``rescue_definition``, its ``confidence`` reaches
    ``args.min_confidence`` and it is not flagged ``needs_human_review``
    (unless ``args.include_needs_review`` is set). Every other paired patch is
    counted as skipped.

    Args:
        args: Parsed options providing ``lexicon``, ``patch``, ``output``,
            ``min_confidence`` and ``include_needs_review``.

    Returns:
        A dict with the ``applied`` and ``skipped`` counters and the
        ``output`` path as a string.

    Raises:
        ValueError: If the lexicon payload is not a dict holding a list under
            ``entries``, or if the patch payload is not a dict.
    """
    lexicon_payload = load_json(args.lexicon, {"entries": []})
    patch_payload = load_json(args.patch, {"entries": []})
    if not isinstance(lexicon_payload, dict):
        raise ValueError(f"Lessico non valido: {args.lexicon}")
    lexicon = lexicon_payload.get("entries")
    if not isinstance(lexicon, list):
        raise ValueError(f"Lessico non valido: {args.lexicon}")
    if not isinstance(patch_payload, dict):
        raise ValueError(f"Patch non valida: {args.patch}")

    # Index the patch by (form, pos); a later duplicate replaces an earlier one.
    patch_by_key = {
        normalize_key(item.get("form", ""), item.get("pos", "")): item
        for item in patch_payload.get("entries") or []
        if isinstance(item, dict)
    }

    min_confidence = float(args.min_confidence)
    applied = 0
    skipped = 0
    for entry in lexicon:
        if not isinstance(entry, dict):
            continue
        patch = patch_by_key.get(normalize_key(entry.get("form", ""), entry.get("pos", "")))
        if not patch:
            continue

        confidence = float(patch.get("confidence", 0.0) or 0.0)
        needs_review = bool(patch.get("needs_human_review", True))
        # ``or ""`` matters: a JSON null would otherwise be stringified to the
        # literal text "None" and applied as if it were a real definition.
        definition = str(patch.get("rescue_definition") or "").strip()
        if (
            not definition
            or confidence < min_confidence
            or (needs_review and not args.include_needs_review)
        ):
            skipped += 1
            continue

        # Same null-safety for the source label: fall back to the default tag.
        source = patch.get("rescue_source") or "llm_rescue"

        entry["preferred_definition"] = definition
        entry["preferred_source"] = source

        clue_defs = entry.get("clue_definitions") or {}
        if not isinstance(clue_defs, dict):
            clue_defs = {}
        # The rescue definition is used for every difficulty level.
        for level in ("easy", "medium", "hard", "expert"):
            clue_defs[level] = definition
        entry["clue_definitions"] = clue_defs

        entry["topics"] = merge_topics(entry.get("topics", []), patch.get("rescue_topics", []))
        entry["semantic_tags"] = merge_topics(
            entry.get("semantic_tags", []), patch.get("rescue_semantic_tags", [])
        )
        entry["alpha_ready"] = True

        # The entry now has a definition; the refined-stage flag is cleared
        # only when flagged-for-review patches are excluded from this run.
        resolved_reasons = {"no_viable_definition"}
        if not args.include_needs_review:
            resolved_reasons.add("flagged_by_refined_stage")
        entry["review_reasons"] = [
            reason for reason in (entry.get("review_reasons") or []) if reason not in resolved_reasons
        ]

        # Keep a trace of what was applied and why.
        entry["llm_rescue"] = {
            "definition": definition,
            "source": source,
            "topics": patch.get("rescue_topics", []),
            "semantic_tags": patch.get("rescue_semantic_tags", []),
            "notes": patch.get("rescue_notes", ""),
            "confidence": confidence,
            "needs_human_review": needs_review,
            "status": patch.get("status", ""),
        }
        applied += 1

    meta = dict(lexicon_payload.get("meta") or {})
    meta["base_lexicon"] = args.lexicon.name
    meta["generated_from_patch"] = args.patch.name
    meta["generated_by"] = "apply_llm_rescue_patch.py"
    meta["entry_count"] = len(lexicon)
    meta["llm_rescue_applied"] = applied
    meta["llm_rescue_skipped"] = skipped
    meta["alpha_ready_count"] = sum(
        1 for item in lexicon if isinstance(item, dict) and item.get("alpha_ready")
    )
    meta["review_count"] = sum(
        1
        for item in lexicon
        if isinstance(item, dict) and (item.get("review_reasons") or item.get("needs_review"))
    )

    write_json(args.output, {"meta": meta, "entries": lexicon})
    return {
        "applied": applied,
        "skipped": skipped,
        "output": str(args.output),
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: apply the patch and print a short summary."""
    summary = apply_patch(parse_args())
    print(f"Lessico aggiornato generato: {summary['output']}")
    print(f"Patch applicate: {summary['applied']}")
    print(f"Voci saltate: {summary['skipped']}")
|
||||
|
||||
|
||||
# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
490
babelnet_daily_batch.py
Normal file
490
babelnet_daily_batch.py
Normal file
@@ -0,0 +1,490 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
from babelnet_incremental_enricher import (
|
||||
DEFAULT_TOPIC,
|
||||
merge_babelnet_entries,
|
||||
rebuild_enriched,
|
||||
)
|
||||
from build_babelnet_enrichment import (
|
||||
BABELNET_CACHE_PATH,
|
||||
BABELNET_ENV_KEY,
|
||||
BABELNET_OUTPUT_PATH,
|
||||
BabelNetApiCallLimitReached,
|
||||
BabelNetKeyUnavailable,
|
||||
POS_TO_BABELNET,
|
||||
enrich_entry,
|
||||
load_babelnet_api_keys,
|
||||
load_json,
|
||||
write_json,
|
||||
)
|
||||
from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH
|
||||
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Batch logs are written to a "logs" folder next to this script.
LOG_DIR = Path(__file__).parent / "logs"

# Default API budgets (overall and per key) and default candidate cap.
DEFAULT_API_CALL_LIMIT = 950
DEFAULT_PER_KEY_API_CALL_LIMIT = 950
DEFAULT_WORD_LIMIT = 10_000

# Inclusive word-length window for a candidate to be eligible.
MIN_WORD_LENGTH = 3
MAX_WORD_LENGTH = 16

# Ranking weight per part of speech; unlisted tags rank as 0.
USEFUL_POS_PRIORITY = {"NOUN": 6, "VERB": 5, "ADJ": 4, "ADV": 3}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Declare and parse the command-line options of the daily BabelNet batch.

    Returns:
        The ``argparse`` namespace with the API budgets, key selection,
        candidate filters, run-mode switches and the three lexicon paths.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Batch giornaliero per fondere progressivamente ItalWordNet e BabelNet: "
            "arricchisce parole mancanti, aggiorna lexicon_it_babelnet.json e rigenera lexicon_it_enriched.json."
        )
    )
    add = cli.add_argument

    # API budgets.
    add(
        "--api-call-limit",
        type=int,
        default=DEFAULT_API_CALL_LIMIT,
        help="Numero massimo complessivo di chiamate API BabelNet reali consentite in questa esecuzione.",
    )
    add(
        "--per-key-api-call-limit",
        type=int,
        default=DEFAULT_PER_KEY_API_CALL_LIMIT,
        help="Numero massimo di chiamate API reali consentite per ciascuna chiave caricata.",
    )

    # Key selection (two mutually exclusive spellings, checked in run_batch).
    add(
        "--token-index",
        default=None,
        help="Usa una o piu chiavi locali, contando da 1. Esempi: --token-index 2 oppure --token-index 1,2,3.",
    )
    add(
        "--token-indexes",
        default=None,
        help="Alias esplicito per una lista di chiavi locali. Esempio: --token-indexes 1,2,3.",
    )

    # Candidate selection and pacing.
    add(
        "--word-limit",
        type=int,
        default=DEFAULT_WORD_LIMIT,
        help="Numero massimo di parole candidate da tentare in questa esecuzione.",
    )
    add("--sleep", type=float, default=0.2, help="Pausa tra richieste API.")
    add(
        "--topic",
        default=None,
        help="Topic opzionale per concentrare il batch su una parte del lessico.",
    )

    # Boolean switches.
    switches = (
        ("--include-not-crossword", "Include anche voci non marcate allowed_in_crossword."),
        ("--retry-no-match", "Riprova anche parole gia marcate come no_match."),
        ("--dry-run", "Mostra le prossime parole candidate senza chiamare BabelNet e senza scrivere file."),
        (
            "--ignore-cache",
            "Ignora la cache in questa esecuzione diagnostica, utile per testare un token specifico.",
        ),
    )
    for flag, description in switches:
        add(flag, action="store_true", help=description)

    # Lexicon files.
    lexicon_paths = (
        ("--semantic", SEMANTIC_LEXICON_OUTPUT_PATH, "Lessico semantico completo di partenza."),
        ("--babelnet", BABELNET_OUTPUT_PATH, "Archivio incrementale degli arricchimenti BabelNet."),
        ("--enriched", ENRICHED_LEXICON_OUTPUT_PATH, "Lessico fuso da rigenerare dopo il batch."),
    )
    for flag, fallback, description in lexicon_paths:
        add(flag, type=Path, default=fallback, help=description)

    return cli.parse_args()
|
||||
|
||||
|
||||
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Return the ``(form, pos)`` identity of a lexicon entry.

    ``normalized_form`` is preferred over ``form``; the form is trimmed and
    lower-cased, the part of speech trimmed and upper-cased. Missing or falsy
    fields yield empty strings.
    """
    raw_form = entry.get("normalized_form") or entry.get("form") or ""
    raw_pos = entry.get("pos") or ""
    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
|
||||
|
||||
|
||||
def load_source_payload(enriched_path: Path, semantic_path: Path) -> Dict[str, object]:
    """Load the lexicon the batch starts from.

    The enriched lexicon is tried first (only if its file exists), then the
    semantic lexicon. The first payload that is a dict containing an
    ``entries`` key is returned.

    Raises:
        ValueError: If neither file yields a valid payload.
    """
    sources = [semantic_path]
    if enriched_path.exists():
        sources.insert(0, enriched_path)

    for source in sources:
        payload = load_json(source, {})
        if isinstance(payload, dict) and "entries" in payload:
            return payload
    raise ValueError(f"Nessun lessico valido trovato: {enriched_path} / {semantic_path}")
|
||||
|
||||
|
||||
def babelnet_status(entry: Dict[str, object]) -> str:
    """Return the entry's BabelNet status as a string.

    ``"not_requested"`` is returned when the ``babelnet`` field is missing,
    is not a dict, or has no ``status`` key.
    """
    info = entry.get("babelnet", {})
    if not isinstance(info, dict):
        return "not_requested"
    return str(info.get("status", "not_requested"))
|
||||
|
||||
|
||||
def entry_topics(entry: Dict[str, object]) -> set[str]:
    """Collect every topic label of an entry, lower-cased, in a single set.

    Labels come from ``topics`` and, when ``semantic`` is a dict, from its
    ``semantic_topics``. Falsy items and missing/null lists are ignored.
    """
    collected = set()
    for label in entry.get("topics", []) or []:
        if label:
            collected.add(str(label).lower())

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        for label in semantic.get("semantic_topics", []) or []:
            if label:
                collected.add(str(label).lower())
    return collected
|
||||
|
||||
|
||||
def eligible_entry(entry: Dict[str, object], args: argparse.Namespace) -> bool:
    """Tell whether *entry* should be sent to BabelNet in this run.

    An entry qualifies when all of the following hold:
    its BabelNet status is ``not_requested`` or ``api_error`` (plus
    ``no_match`` with ``--retry-no-match``); its part of speech is a key of
    ``POS_TO_BABELNET``; its form is purely alphabetic and between
    ``MIN_WORD_LENGTH`` and ``MAX_WORD_LENGTH`` characters; it is allowed in
    crosswords (unless ``--include-not-crossword``); and, when ``--topic`` is
    given, that topic is among the entry's topics.
    """
    retryable = {"not_requested", "api_error"}
    if args.retry_no_match:
        retryable |= {"no_match"}
    if babelnet_status(entry) not in retryable:
        return False

    tag = str(entry.get("pos", "")).strip().upper()
    if tag not in POS_TO_BABELNET:
        return False

    form = str(entry.get("form", "")).strip().lower()
    if not (form.isalpha() and MIN_WORD_LENGTH <= len(form) <= MAX_WORD_LENGTH):
        return False

    if not (args.include_not_crossword or entry.get("allowed_in_crossword", False)):
        return False

    if args.topic:
        return args.topic.strip().lower() in entry_topics(entry)
    return True
|
||||
|
||||
|
||||
def candidate_priority(entry: Dict[str, object]) -> Tuple[int, int, int, int, int, str]:
    """Return the sort key used to rank enrichment candidates.

    Higher tuples rank first (callers sort with ``reverse=True``). The
    components are, in order:

    1. 2 if the entry has a topic other than ``DEFAULT_TOPIC``,
       ``"abstract"`` and ``"actions"``, else 0;
    2. 1 if the entry has any semantic topic, else 0;
    3. the integer ``quality_score`` (0 when missing or null);
    4. the part-of-speech weight from ``USEFUL_POS_PRIORITY`` (0 if unlisted);
    5. 3 for words of 4 to 11 characters, else 1;
    6. the word itself, as a final tie-breaker.
    """
    word = str(entry.get("form", ""))
    pos = str(entry.get("pos", "")).upper()
    topics = {str(item).lower() for item in entry.get("topics", []) or []}

    semantic = entry.get("semantic", {})
    semantic_topics = set()
    if isinstance(semantic, dict):
        semantic_topics = {str(item).lower() for item in semantic.get("semantic_topics", []) or []}

    useful_topic_bonus = 2 if topics - {DEFAULT_TOPIC, "abstract", "actions"} else 0
    semantic_topic_bonus = 1 if semantic_topics else 0
    length_bonus = 3 if 4 <= len(word) <= 11 else 1

    # ``or 0`` guards a JSON null score: int(None) would raise TypeError and
    # abort the sort of the whole candidate list.
    quality = int(entry.get("quality_score") or 0)

    return (
        useful_topic_bonus,
        semantic_topic_bonus,
        quality,
        USEFUL_POS_PRIORITY.get(pos, 0),
        length_bonus,
        word,
    )
|
||||
|
||||
|
||||
def select_candidates(payload: Dict[str, object], args: argparse.Namespace) -> List[Dict[str, object]]:
    """Pick the entries to enrich in this run, best candidates first.

    Eligible dict entries of ``payload["entries"]`` are ranked by
    ``candidate_priority`` in descending order and truncated to
    ``args.word_limit`` (negative limits are treated as 0).
    """
    eligible = []
    for entry in payload.get("entries", []) or []:
        if isinstance(entry, dict) and eligible_entry(entry, args):
            eligible.append(entry)

    ranked = sorted(eligible, key=candidate_priority, reverse=True)
    cap = max(0, args.word_limit)
    return ranked[:cap]
|
||||
|
||||
|
||||
def progress_counts(payload: Dict[str, object]) -> Dict[str, int]:
    """Count the dict entries of ``payload["entries"]`` by BabelNet status."""
    tally: Dict[str, int] = {}
    for entry in payload.get("entries", []) or []:
        if isinstance(entry, dict):
            status = babelnet_status(entry)
            tally.setdefault(status, 0)
            tally[status] += 1
    return tally
|
||||
|
||||
|
||||
def parse_token_indexes(value: Optional[str], key_count: int, option_name: str) -> Optional[List[int]]:
    """Parse a 1-based list of key indexes such as ``"1,2,3"`` or ``"2;3"``.

    Args:
        value: Raw option value; ``None`` means the option was not given.
        key_count: Number of loaded keys; every index must be in ``1..key_count``.
        option_name: Option spelling used in error messages.

    Returns:
        ``None`` when *value* is ``None``; otherwise the distinct indexes in
        the order they were first listed. Empty fragments are ignored.

    Raises:
        SystemExit: On a non-numeric fragment, an out-of-range index, or when
            no index at all is left after parsing.
    """
    if value is None:
        return None

    chosen: List[int] = []
    # Semicolons are accepted as an alternative separator.
    for fragment in str(value).replace(";", ",").split(","):
        token = fragment.strip()
        if not token:
            continue
        try:
            number = int(token)
        except ValueError as exc:
            raise SystemExit(f"{option_name} deve contenere solo numeri separati da virgola.") from exc
        if number < 1 or number > key_count:
            raise SystemExit(
                f"{option_name} contiene {number}, ma deve essere tra 1 e {key_count}. Chiavi caricate: {key_count}."
            )
        if number not in chosen:
            chosen.append(number)

    if not chosen:
        raise SystemExit(f"{option_name} non contiene nessun indice valido.")
    return chosen
|
||||
|
||||
|
||||
def write_batch_log(payload: Dict[str, object]) -> Path:
    """Write *payload* as a timestamped JSON file inside ``LOG_DIR`` and return its path.

    The folder is created if needed. The file name embeds the local time with
    one-second resolution (``babelnet_batch_YYYYMMDD_HHMMSS.json``).
    """
    LOG_DIR.mkdir(exist_ok=True)
    local_now = datetime.now().astimezone()
    target = LOG_DIR / f"babelnet_batch_{local_now:%Y%m%d_%H%M%S}.json"
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    target.write_text(serialized, encoding="utf-8")
    return target
|
||||
|
||||
|
||||
def run_batch(args: argparse.Namespace) -> Dict[str, object]:
    """Run one enrichment batch (or a dry run) and return its summary.

    Candidates are selected from the current lexicon and enriched one at a
    time with ``enrich_entry``, always using the key with the fewest API calls
    so far. The loop stops when the global or per-key API budget is exhausted,
    or when every key is unavailable. Afterwards the BabelNet archive is
    merged and written, the enriched lexicon is rebuilt and a JSON log is
    saved.

    With ``--ignore-cache`` lookups start from an empty in-memory cache, but
    fresh results are merged into the cache already on disk rather than
    replacing it, so a diagnostic run cannot wipe previously cached responses.

    Args:
        args: Parsed command-line options (see ``parse_args``).

    Returns:
        For ``--dry-run``: mode, candidate count, the first 50 candidate
        words and the initial status counts. Otherwise the full batch report,
        including ``log_path``.

    Raises:
        SystemExit: If no API key is available or the key-selection options
            are invalid or both given.
    """
    source_payload = load_source_payload(args.enriched, args.semantic)
    candidates = select_candidates(source_payload, args)
    before_counts = progress_counts(source_payload)

    if args.dry_run:
        return {
            "mode": "dry-run",
            "candidate_count": len(candidates),
            "selected_words": [entry.get("form") for entry in candidates[:50]],
            "before_counts": before_counts,
        }

    api_keys = load_babelnet_api_keys()
    if not api_keys:
        raise SystemExit(
            f"Chiave BabelNet mancante. Imposta {BABELNET_ENV_KEY} oppure crea .babelnet_api_key.local."
        )
    token_indexes = parse_token_indexes(args.token_index, len(api_keys), "--token-index")
    token_indexes_alias = parse_token_indexes(args.token_indexes, len(api_keys), "--token-indexes")
    if token_indexes and token_indexes_alias:
        raise SystemExit("Usa solo uno tra --token-index e --token-indexes.")
    selected_token_indexes = token_indexes or token_indexes_alias
    if selected_token_indexes:
        api_keys = [api_keys[index - 1] for index in selected_token_indexes]

    # ``disk_cache`` mirrors the persistent cache file; ``cache`` is what the
    # lookups of this run see. They are the same object unless --ignore-cache.
    disk_cache = load_json(BABELNET_CACHE_PATH, {})
    if not isinstance(disk_cache, dict):
        disk_cache = {}
    cache = {} if args.ignore_cache else disk_cache

    babelnet_payload = load_json(args.babelnet, {"entries": []})
    if not isinstance(babelnet_payload, dict):
        babelnet_payload = {"entries": []}

    counters = ("api_calls", "cache_hits", "responses")
    global_stats = {
        "api_calls": 0,
        "cache_hits": 0,
        "responses": 0,
        "api_call_limit": max(0, args.api_call_limit),
    }
    per_key_limit = max(0, args.per_key_api_call_limit)
    key_stats = [
        {
            "key_index": selected_token_indexes[index] if selected_token_indexes else index + 1,
            "local_key_index": index + 1,
            "api_calls": 0,
            "cache_hits": 0,
            "responses": 0,
            "api_call_limit": per_key_limit,
        }
        for index, _ in enumerate(api_keys)
    ]
    enriched_entries: List[Dict[str, object]] = []
    word_logs = []
    stopped_reason = "completed"

    def select_key_index() -> Optional[int]:
        """Return the position of the least-used key with budget left, or None."""
        available = [
            (stats["api_calls"], index)
            for index, stats in enumerate(key_stats)
            if stats["api_calls"] < stats["api_call_limit"]
        ]
        if not available:
            return None
        return min(available)[1]

    def absorb_key_usage(stats: Dict[str, object], key_before: Dict[str, int]) -> None:
        """Add what the key consumed since *key_before* to the global counters."""
        for name in counters:
            global_stats[name] += stats[name] - key_before[name]

    def usage_log(position: int, item: Dict[str, object], stats: Dict[str, object],
                  global_before: Dict[str, int]) -> Dict[str, object]:
        """Build the fields shared by every per-word log record."""
        record = {
            "index": position,
            "word": item.get("form"),
            "pos": item.get("pos"),
            "key_index": stats["key_index"],
        }
        for name in counters:
            record[name] = global_stats[name] - global_before[name]
        return record

    def persist_cache() -> None:
        """Write the cache file, merging fresh results when the cache was ignored."""
        if cache is not disk_cache:
            disk_cache.update(cache)
        write_json(BABELNET_CACHE_PATH, disk_cache)

    for index, entry in enumerate(candidates, start=1):
        if global_stats["api_calls"] >= global_stats["api_call_limit"]:
            stopped_reason = "api_call_limit"
            break
        key_index = select_key_index()
        if key_index is None:
            stopped_reason = "per_key_api_call_limit"
            break

        stats = key_stats[key_index]
        global_before = {name: global_stats[name] for name in counters}
        key_before = {name: stats[name] for name in counters}

        # Work on a copy and drop any previous BabelNet data before enriching.
        updated = deepcopy(entry)
        updated.pop("babelnet", None)
        try:
            # NOTE(review): enrich_entry is expected to update ``stats`` in
            # place; global counters are derived from its deltas. Confirm.
            updated["babelnet"] = enrich_entry(updated, api_keys[key_index], cache, args.sleep, stats)
        except BabelNetApiCallLimitReached:
            absorb_key_usage(stats, key_before)
            stopped_reason = "per_key_api_call_limit"
            break
        except BabelNetKeyUnavailable as exc:
            absorb_key_usage(stats, key_before)
            # Mark the key as exhausted so it is never selected again.
            stats["api_calls"] = stats["api_call_limit"]
            failure_log = usage_log(index, updated, stats, global_before)
            failure_log.update(
                {
                    "matched": False,
                    "synsets": 0,
                    "reason": "key_unavailable_or_daily_limit",
                    "error": str(exc),
                }
            )
            word_logs.append(failure_log)
            print(
                f"[{index}/{len(candidates)}] {updated.get('form')}: "
                f"token={stats['key_index']} non disponibile o limite giornaliero raggiunto"
            )
            if select_key_index() is None:
                stopped_reason = "all_keys_unavailable_or_daily_limit"
                break
            continue

        absorb_key_usage(stats, key_before)
        enriched_entries.append(updated)
        # Persist after every word so an interrupted run keeps its results.
        persist_cache()

        word_log = usage_log(index, updated, stats, global_before)
        word_log.update(
            {
                "matched": bool(updated.get("babelnet", {}).get("matched")),
                "synsets": len(updated.get("babelnet", {}).get("synsets", []) or []),
                "reason": updated.get("babelnet", {}).get("reason"),
            }
        )
        word_logs.append(word_log)
        print(
            f"[{index}/{len(candidates)}] {word_log['word']}: "
            f"token={word_log['key_index']} api_calls={word_log['api_calls']} cache_hits={word_log['cache_hits']} "
            f"match={word_log['matched']} tot_api={global_stats['api_calls']}/{global_stats['api_call_limit']}"
        )

    merged_babelnet = merge_babelnet_entries(
        babelnet_payload,
        enriched_entries,
        args.topic or "all",
        "all",
    )
    write_json(args.babelnet, merged_babelnet)
    enriched_payload = rebuild_enriched(
        args.semantic,
        args.babelnet,
        args.enriched,
        args.topic or DEFAULT_TOPIC,
    )
    after_counts = progress_counts(enriched_payload)

    # Coverage = share of entries that are no longer "not_requested".
    total_entries = int(enriched_payload.get("meta", {}).get("entry_count", 0))
    covered = total_entries - after_counts.get("not_requested", 0)
    coverage = covered / total_entries if total_entries else 0.0

    result = {
        "mode": "batch",
        "started_topic": args.topic,
        "stopped_reason": stopped_reason,
        "candidate_count": len(candidates),
        "attempted_words": len(enriched_entries),
        "matched_words": sum(1 for entry in enriched_entries if entry.get("babelnet", {}).get("matched")),
        "api_calls": global_stats["api_calls"],
        "cache_hits": global_stats["cache_hits"],
        "responses": global_stats["responses"],
        "api_call_limit": global_stats["api_call_limit"],
        "api_key_count": len(api_keys),
        "forced_token_indexes": selected_token_indexes,
        "per_key_api_call_limit": per_key_limit,
        "per_key_stats": key_stats,
        "before_counts": before_counts,
        "after_counts": after_counts,
        "total_entries": total_entries,
        "covered_entries": covered,
        "coverage_ratio": coverage,
        "word_logs": word_logs,
    }
    log_path = write_batch_log(result)
    result["log_path"] = str(log_path)
    return result
|
||||
|
||||
|
||||
def print_result(result: Dict[str, object]) -> None:
    """Print a human-readable report of a batch result to standard output.

    A ``dry-run`` result lists the selected candidates; any other result
    prints the stop reason, the counters, per-key usage, coverage and the
    location of the log file.
    """
    if result["mode"] == "dry-run":
        print("Dry-run batch BabelNet")
        print(f"Candidate selezionate: {result['candidate_count']}")
        print(f"Stati iniziali: {result['before_counts']}")
        print("Prime parole:")
        for position, candidate in enumerate(result["selected_words"], start=1):
            print(f"{position:>2}. {candidate}")
    else:
        print("Batch BabelNet completato")
        print(f"- motivo stop: {result['stopped_reason']}")
        print(f"- parole tentate: {result['attempted_words']}/{result['candidate_count']}")
        print(f"- parole con match: {result['matched_words']}")
        print(f"- chiamate API reali: {result['api_calls']}/{result['api_call_limit']}")
        print(f"- chiavi caricate: {result['api_key_count']} (limite per chiave: {result['per_key_api_call_limit']})")

        # Only shown when the user restricted the run to specific keys.
        if result.get("forced_token_indexes"):
            labels = ", ".join("#" + str(number) for number in result["forced_token_indexes"])
            print(f"- token forzati: {labels}")

        for key_report in result["per_key_stats"]:
            print(f" chiave #{key_report['key_index']}: {key_report['api_calls']}/{key_report['api_call_limit']} chiamate API")

        print(f"- cache hit: {result['cache_hits']}")
        print(f"- copertura lessico: {result['covered_entries']}/{result['total_entries']} ({result['coverage_ratio'] * 100:.1f}%)")
        print(f"- stati dopo: {result['after_counts']}")
        print(f"- log: {result['log_path']}")
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: run the batch and print its report."""
    print_result(run_batch(parse_args()))
|
||||
|
||||
|
||||
# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
583
babelnet_incremental_enricher.py
Normal file
583
babelnet_incremental_enricher.py
Normal file
@@ -0,0 +1,583 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
from build_babelnet_enrichment import (
|
||||
BABELNET_CACHE_PATH,
|
||||
BABELNET_ENV_KEY,
|
||||
BABELNET_OUTPUT_PATH,
|
||||
POS_TO_BABELNET,
|
||||
enrich_entry,
|
||||
load_json,
|
||||
write_json,
|
||||
)
|
||||
from build_enriched_lexicon import (
|
||||
ENRICHED_LEXICON_OUTPUT_PATH,
|
||||
build_enriched_lexicon,
|
||||
write_json as write_enriched_json,
|
||||
)
|
||||
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Named difficulty levels accepted by --difficulty and their numeric value.
DIFFICULTY_ALIASES: Dict[str, int] = {"easy": 1, "medium": 2, "hard": 4, "expert": 5}

# Topic used when none is requested.
DEFAULT_TOPIC = "general"

# NOTE(review): the constants below are not used in the visible part of the
# file; their names suggest fill-word selection heuristics. Confirm at use site.
ABSTRACTISH_SUFFIXES = (
    "zione",
    "zioni",
    "mento",
    "menti",
    "ita",
    "ezza",
    "anza",
    "enza",
    "ismo",
)
FILL_ALLOWED_POS = {"NOUN", "VERB", "ADJ", "ADV", "PREP", "CONJ"}
GENERAL_FILL_MIN_QUALITY = 6
GENERAL_FILL_MAX_LENGTH = 10
SOFT_RELATED_FILL_LIMIT = 120
CONCRETE_TOPICS = {
    "animals", "plants", "nature", "ecology", "geography",
    "weather", "sea", "mountain", "health", "science",
    "sport", "history", "school", "cinema", "literature",
    "food", "city", "transport", "work", "home",
}

# Per topic: a word matches the topic roots when it contains at least one of
# these substrings (topics without an entry accept every word).
TOPIC_SEED_REQUIRED_SUBSTRINGS: Dict[str, Tuple[str, ...]] = {
    "transport": (
        "auto", "mot", "tren", "nav", "barc", "port", "pist",
        "vol", "aer", "bici", "cicl", "rimorch", "reattor", "vettur",
        "ambul", "imbarc", "trattor", "carr", "vap", "rota", "ruot",
    ),
    "animals": (
        "can", "gatt", "lup", "ors", "pesc",
        "aquil", "anatr", "cavall", "serpent", "tig",
        "leon", "volp", "cerv", "capr", "pecor",
    ),
    "nature": (
        "mar", "lag", "fium", "vent", "bosch",
        "mont", "collin", "isol", "rocc", "terra",
        "acqu", "fiore", "fogli", "radic", "affluent",
        "litoral", "piogg", "nev", "onda", "clim",
    ),
    "cinema": (
        "film", "cin", "teatr", "attor", "scen", "reg", "doppi",
        "dialog", "comic", "div", "docu", "pellic", "spettacol",
    ),
}

# Per topic: a word is blocked when it contains any of these substrings.
TOPIC_SEED_BLOCKED_SUBSTRINGS: Dict[str, Tuple[str, ...]] = {
    "transport": (
        "intervist", "intratten", "speriment", "stermin", "investig",
        "intervent", "centometr", "sintetizz", "erot", "adoraz",
        "esalt", "eccit", "traduz", "fluttu", "sollecit",
    ),
    "animals": (
        "assicur", "finanz", "coediz", "camerier", "servitor", "indic",
        "estens", "diffus", "difensor", "spessor", "maggior",
    ),
    "cinema": (
        "manifest", "riediz", "dissimul", "diffus",
        "difensor", "estens", "malumor", "eversor",
    ),
}

# BabelNet statuses for which an entry may still be (re)requested.
ENRICHABLE_STATUSES = {"not_requested", "api_error"}

# Per topic: word prefixes checked by matches_safe_babelnet_roots.
BABELNET_TOPIC_SAFE_PREFIXES: Dict[str, Tuple[str, ...]] = {
    "transport": (
        "ambul", "aer", "autobus", "autocar", "automob",
        "autostrad", "autoveic", "autovett", "bicicl", "ciclo",
        "imbarc", "locom", "motoc", "motr", "navig",
        "rimorch", "trattor", "tren", "veicol", "vettur",
    ),
}
|
||||
|
||||
|
||||
def parse_difficulty(value: str) -> int:
    """Convert a ``--difficulty`` value to its numeric level.

    Accepts one of the names in ``DIFFICULTY_ALIASES`` (case-insensitive,
    surrounding whitespace ignored) or an integer from 1 to 5.

    Raises:
        SystemExit: If the value is neither a known name nor an integer, or
            if the integer is outside 1..5.
    """
    token = str(value).strip().lower()
    if token in DIFFICULTY_ALIASES:
        return DIFFICULTY_ALIASES[token]

    try:
        numeric = int(token)
    except ValueError as exc:
        raise SystemExit(
            "Valore non valido per --difficulty. Usa easy, medium, hard, expert oppure un intero tra 1 e 5."
        ) from exc

    if numeric < 1 or numeric > 5:
        raise SystemExit("Il valore numerico di --difficulty deve essere compreso tra 1 e 5.")
    return numeric
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Declare and parse the command-line options of the incremental enricher.

    Returns:
        The ``argparse`` namespace. ``api_key`` defaults to the value of the
        environment variable named by ``BABELNET_ENV_KEY``; ``words`` is
        ``None`` unless ``--words`` is given.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Arricchisce incrementalmente il lessico: seleziona parole mancanti, "
            "chiama BabelNet entro un limite e rigenera lexicon_it_enriched.json."
        )
    )
    add = cli.add_argument

    add(
        "--api-key",
        default=os.environ.get(BABELNET_ENV_KEY),
        help=f"Chiave API BabelNet. In alternativa imposta la variabile ambiente {BABELNET_ENV_KEY}.",
    )
    add(
        "--topic",
        default=DEFAULT_TOPIC,
        help="Topic per cui scegliere le prossime parole da arricchire.",
    )
    add(
        "--difficulty",
        default="medium",
        help="Difficolta massima: easy, medium, hard, expert oppure 1-5.",
    )
    add(
        "--limit",
        type=int,
        default=50,
        help="Numero massimo di parole da arricchire in questa esecuzione.",
    )
    add("--sleep", type=float, default=0.2, help="Pausa tra richieste API.")

    # Lexicon files.
    lexicon_paths = (
        ("--semantic", SEMANTIC_LEXICON_OUTPUT_PATH, "Lessico semantico completo di partenza."),
        ("--babelnet", BABELNET_OUTPUT_PATH, "Archivio degli arricchimenti BabelNet parziali."),
        ("--enriched", ENRICHED_LEXICON_OUTPUT_PATH, "Lessico arricchito da aggiornare."),
    )
    for flag, fallback, description in lexicon_paths:
        add(flag, type=Path, default=fallback, help=description)

    # Boolean switches.
    switches = (
        ("--dry-run", "Mostra le parole candidate senza chiamare BabelNet e senza scrivere file."),
        ("--retry-no-match", "Riprova anche parole gia marcate come no_match."),
    )
    for flag, description in switches:
        add(flag, action="store_true", help=description)

    add(
        "--words",
        nargs="*",
        default=None,
        help="Parole specifiche da arricchire, utile per generare definizioni sul cruciverba finale.",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Return the ``(form, pos)`` pair that identifies a lexicon entry.

    The form comes from ``normalized_form`` when that value is truthy,
    otherwise from ``form``; it is stripped and lower-cased. The part of
    speech is stripped and upper-cased. Missing or falsy values become
    empty strings.
    """
    raw_form = entry.get("normalized_form") or entry.get("form") or ""
    raw_pos = entry.get("pos") or ""
    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
|
||||
|
||||
|
||||
def dedupe(items: Iterable[Dict[str, object]]) -> List[Dict[str, object]]:
    """Drop entries whose ``entry_key`` was already seen, keeping the first.

    The relative order of the surviving entries is the input order.
    """
    first_by_key = {}
    for candidate in items:
        # setdefault keeps the earliest entry for a key; dicts preserve
        # insertion order, so the output order matches first occurrences.
        first_by_key.setdefault(entry_key(candidate), candidate)
    return list(first_by_key.values())
|
||||
|
||||
|
||||
def entry_topics(entry: Dict[str, object]) -> Tuple[set[str], set[str]]:
    """Return the entry's topics and semantic topics as lower-cased sets.

    Returns:
        A pair ``(topics, semantic_topics)``. ``topics`` comes from the
        entry's ``topics`` list; ``semantic_topics`` comes from
        ``entry["semantic"]["semantic_topics"]`` when ``semantic`` is a dict,
        and is empty otherwise. Falsy items are skipped.
    """
    # ``or ()`` also covers an explicit null in the JSON, which a plain
    # ``.get(key, [])`` default does not (it would raise TypeError).
    topics = {str(item).lower() for item in (entry.get("topics") or ()) if item}
    semantic = entry.get("semantic", {})
    semantic_topics: set[str] = set()
    if isinstance(semantic, dict):
        semantic_topics = {
            str(item).lower() for item in (semantic.get("semantic_topics") or ()) if item
        }
    return topics, semantic_topics
|
||||
|
||||
|
||||
def current_babelnet_status(entry: Dict[str, object]) -> str:
    """Return the entry's BabelNet status string.

    Falls back to ``"not_requested"`` when the entry has no ``babelnet``
    dict or that dict has no ``status`` key.
    """
    babelnet = entry.get("babelnet", {})
    if not isinstance(babelnet, dict):
        return "not_requested"
    return str(babelnet.get("status", "not_requested"))
|
||||
|
||||
|
||||
def matches_topic_roots(word: str, topic: str) -> bool:
    """Tell whether ``word`` contains one of the topic's required substrings.

    A topic with no configured substrings in
    ``TOPIC_SEED_REQUIRED_SUBSTRINGS`` accepts every word.
    """
    required = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(topic, ())
    if not required:
        return True
    for fragment in required:
        if fragment in word:
            return True
    return False
|
||||
|
||||
|
||||
def matches_safe_babelnet_roots(word: str, topic: str) -> bool:
    """Tell whether ``word`` starts with one of the topic's safe prefixes.

    Topics absent from ``BABELNET_TOPIC_SAFE_PREFIXES`` never match.
    """
    safe_prefixes = BABELNET_TOPIC_SAFE_PREFIXES.get(topic)
    if safe_prefixes is None:
        return False
    # str.startswith accepts a tuple of alternatives; an empty tuple is False.
    return word.startswith(tuple(safe_prefixes))
|
||||
|
||||
|
||||
def is_blocked_for_topic(word: str, topic: str) -> bool:
    """Tell whether ``word`` contains a substring blocked for ``topic``."""
    for blocked in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(topic, ()):
        if blocked in word:
            return True
    return False
|
||||
|
||||
|
||||
def topic_score(entry: Dict[str, object], topic: str) -> int:
    """Score how well an entry fits ``topic``; higher is better.

    The default topic always yields a flat 20. For any other topic the
    score is the sum of the bonuses and penalties listed below that apply.
    """
    if topic == DEFAULT_TOPIC:
        return 20

    word = str(entry.get("form", "")).lower()
    topics, semantic_topics = entry_topics(entry)

    # (condition, weight) pairs; every condition that holds adds its weight.
    adjustments = (
        (topic in topics, 100),
        (topic in semantic_topics, 45),
        (matches_topic_roots(word, topic), 35),
        (DEFAULT_TOPIC in topics, 5),
        (is_blocked_for_topic(word, topic), -100),
        (topic in CONCRETE_TOPICS and word.endswith(ABSTRACTISH_SUFFIXES), -30),
    )
    return sum(weight for applies, weight in adjustments if applies)
|
||||
|
||||
|
||||
def candidate_score(entry: Dict[str, object], topic: str) -> Tuple[int, int, int, int, int, str]:
    """Build the sort key used to rank enrichment candidates.

    The tuple compares, in order: topic score, ``quality_score``, a
    part-of-speech bonus, a bonus for a matched semantic block, a word
    length bonus, and finally the word itself as a tie-breaker.
    """
    word = str(entry.get("form", ""))
    pos = str(entry.get("pos", ""))

    bonus_by_pos = {"NOUN": 12, "VERB": 8, "ADJ": 6, "ADV": 4}
    pos_bonus = bonus_by_pos.get(pos, 0)

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict) and semantic.get("matched"):
        semantic_bonus = 3
    else:
        semantic_bonus = 0

    word_length = len(word)
    if 4 <= word_length <= 10:
        length_bonus = 4
    elif word_length <= 14:
        length_bonus = 1
    else:
        length_bonus = -3

    quality = int(entry.get("quality_score", 0))
    return (topic_score(entry, topic), quality, pos_bonus, semantic_bonus, length_bonus, word)
|
||||
|
||||
|
||||
def eligible_for_babelnet(entry: Dict[str, object], topic: str, difficulty_level: int, retry_no_match: bool) -> bool:
    """Decide whether an entry may be sent to BabelNet for ``topic``.

    An entry qualifies when its current BabelNet status is enrichable
    (optionally including ``no_match``), its form is alphabetic with 3-16
    characters, its POS is known to both ``POS_TO_BABELNET`` and
    ``FILL_ALLOWED_POS``, its ``difficulty_word`` does not exceed
    ``difficulty_level`` and it is allowed in crosswords. For a non-default
    topic it must also relate to the topic through its topics, a safe
    prefix, or (for non-concrete topics only) its semantic topics.
    """
    word = str(entry.get("form", "")).lower()
    pos = str(entry.get("pos", ""))
    topics, semantic_topics = entry_topics(entry)
    status = current_babelnet_status(entry)

    allowed_statuses = set(ENRICHABLE_STATUSES) | ({"no_match"} if retry_no_match else set())
    if status not in allowed_statuses:
        return False

    well_formed = word.isalpha() and 3 <= len(word) <= 16
    if not well_formed:
        return False
    if not (pos in POS_TO_BABELNET and pos in FILL_ALLOWED_POS):
        return False
    if int(entry.get("difficulty_word", 5)) > difficulty_level:
        return False
    if not entry.get("allowed_in_crossword", False):
        return False

    if topic == DEFAULT_TOPIC:
        return True

    is_concrete_topic = topic in CONCRETE_TOPICS
    if is_concrete_topic and word.endswith(ABSTRACTISH_SUFFIXES):
        return False
    return (
        topic in topics
        or matches_safe_babelnet_roots(word, topic)
        or (topic in semantic_topics and not is_concrete_topic)
    )
|
||||
|
||||
|
||||
def select_candidates(payload: Dict[str, object], topic: str, difficulty_level: int, limit: int, retry_no_match: bool) -> List[Dict[str, object]]:
    """Pick up to ``limit`` entries to enrich for ``topic``, best first.

    Entries are first filtered with ``eligible_for_babelnet``. For a
    non-default topic they are then split into three tiers:

    * strong: the topic is among the entry's own topics;
    * soft: not strong, ``quality_score`` at least
      ``GENERAL_FILL_MIN_QUALITY`` and form no longer than
      ``GENERAL_FILL_MAX_LENGTH`` (only the best ``SOFT_RELATED_FILL_LIMIT``
      are kept);
    * support: not strong, same minimum quality, form longer than the soft
      length cap and not ending in an abstract-looking suffix.

    The result is de-duplicated by ``entry_key`` and sorted by
    ``candidate_score`` in descending order. A negative ``limit`` is
    treated as zero.
    """

    def rank(item: Dict[str, object]) -> Tuple[int, int, int, int, int, str]:
        return candidate_score(item, topic)

    entries = [
        entry
        for entry in payload.get("entries", []) or []
        if isinstance(entry, dict) and eligible_for_babelnet(entry, topic, difficulty_level, retry_no_match)
    ]

    if topic != DEFAULT_TOPIC:
        # Single-pass partition. Tier membership depends only on the entry's
        # content, so this matches the former "not in strong / not in soft"
        # list scans without their quadratic dict comparisons.
        strong: List[Dict[str, object]] = []
        soft: List[Dict[str, object]] = []
        support: List[Dict[str, object]] = []
        for entry in entries:
            if topic in entry_topics(entry)[0]:
                strong.append(entry)
                continue
            if int(entry.get("quality_score", 0)) < GENERAL_FILL_MIN_QUALITY:
                continue
            form = str(entry.get("form", ""))
            if len(form) <= GENERAL_FILL_MAX_LENGTH:
                soft.append(entry)
            elif not form.endswith(ABSTRACTISH_SUFFIXES):
                support.append(entry)

        entries = strong + sorted(soft, key=rank, reverse=True)[:SOFT_RELATED_FILL_LIMIT]
        entries += sorted(support, key=rank, reverse=True)

    entries = dedupe(entries)
    entries.sort(key=rank, reverse=True)
    # Clamp so a negative limit cannot slice from the end of the list.
    return entries[: max(0, limit)]
|
||||
|
||||
|
||||
def select_word_candidates(
    payload: Dict[str, object],
    words: Iterable[str],
    limit: int,
    retry_no_match: bool,
) -> List[Dict[str, object]]:
    """Pick the lexicon entries for explicitly requested ``words``.

    Words are stripped, lower-cased and de-duplicated, keeping request
    order. A word is selected when the lexicon has an entry with that form,
    its BabelNet status is enrichable (optionally including ``no_match``),
    its POS is in ``POS_TO_BABELNET`` and its form is alphabetic. At most
    ``limit`` entries are returned; a non-positive ``limit`` returns an
    empty list.
    """
    # Checking the limit only after appending used to return one entry for
    # limit=0; select_candidates returns none in that case.
    if limit <= 0:
        return []

    normalized_words = (str(word).strip().lower() for word in words)
    requested = list(dict.fromkeys(word for word in normalized_words if word))

    # When several entries share a form (e.g. different POS) the last wins.
    by_word = {
        str(entry.get("form", "")).lower(): entry
        for entry in payload.get("entries", []) or []
        if isinstance(entry, dict)
    }

    allowed_statuses = set(ENRICHABLE_STATUSES)
    if retry_no_match:
        allowed_statuses.add("no_match")

    selected: List[Dict[str, object]] = []
    for word in requested:
        entry = by_word.get(word)
        if not entry:
            continue
        if current_babelnet_status(entry) not in allowed_statuses:
            continue
        if str(entry.get("pos", "")) not in POS_TO_BABELNET:
            continue
        if not str(entry.get("form", "")).isalpha():
            continue
        selected.append(entry)
        if len(selected) >= limit:
            break

    return selected
|
||||
|
||||
|
||||
def load_source_payload(enriched_path: Path, semantic_path: Path) -> Dict[str, object]:
    """Load the lexicon to pick candidates from.

    The enriched lexicon is preferred when the file exists and holds a dict
    with an ``entries`` key; otherwise the semantic lexicon is used.

    Raises:
        ValueError: if neither file yields a dict with an ``entries`` key.
    """

    def read_valid(path: Path) -> Optional[Dict[str, object]]:
        loaded = load_json(path, {})
        if isinstance(loaded, dict) and "entries" in loaded:
            return loaded
        return None

    if enriched_path.exists():
        enriched = read_valid(enriched_path)
        if enriched is not None:
            return enriched

    semantic = read_valid(semantic_path)
    if semantic is not None:
        return semantic

    raise ValueError(f"Nessun lessico valido trovato: {enriched_path} / {semantic_path}")
|
||||
|
||||
|
||||
def merge_babelnet_entries(existing_payload: Dict[str, object], new_entries: List[Dict[str, object]], topic: str, difficulty: str) -> Dict[str, object]:
    """Merge freshly enriched entries into the BabelNet archive payload.

    Entries are keyed by ``entry_key``; a new entry replaces an archived one
    with the same key and is stamped with ``babelnet_generated_at``. Inputs
    are deep-copied, never mutated. The result has its entries sorted by
    ``(form, pos)`` and its ``meta`` refreshed with the run's topic,
    difficulty, timestamp and entry count.
    """
    merged = {}
    for archived in existing_payload.get("entries", []) or []:
        if isinstance(archived, dict):
            merged[entry_key(archived)] = deepcopy(archived)

    generated_at = datetime.now().astimezone().isoformat(timespec="seconds")

    for fresh in new_entries:
        stamped = deepcopy(fresh)
        stamped["babelnet_generated_at"] = generated_at
        merged[entry_key(stamped)] = stamped

    entries = list(merged.values())
    entries.sort(key=lambda item: (str(item.get("form", "")), str(item.get("pos", ""))))

    previous_meta = existing_payload.get("meta", {})
    meta = dict(previous_meta) if isinstance(previous_meta, dict) else {}
    meta.update(
        language=meta.get("language", "it"),
        version=max(1, int(meta.get("version", 1))),
        source="BabelNet API",
        updated_at=generated_at,
        last_topic=topic,
        last_difficulty=difficulty,
        entry_count=len(entries),
    )
    return {"meta": meta, "entries": entries}
|
||||
|
||||
|
||||
def rebuild_enriched(semantic_path: Path, babelnet_path: Path, enriched_path: Path, topic: str) -> Dict[str, object]:
    """Regenerate the enriched lexicon file and return its payload.

    ``build_enriched_lexicon`` expects an argparse-style namespace, so the
    paths and topic are wrapped in a ``SimpleNamespace`` with the attribute
    names it reads (``semantic``, ``babelnet``, ``output``, ``topic``).
    """
    build_args = SimpleNamespace(
        semantic=semantic_path,
        babelnet=babelnet_path,
        output=enriched_path,
        topic=topic,
    )
    enriched_payload = build_enriched_lexicon(build_args)
    write_enriched_json(enriched_path, enriched_payload)
    return enriched_payload
|
||||
|
||||
|
||||
def run_incremental_enrichment(args: argparse.Namespace) -> Dict[str, object]:
    """Run one incremental BabelNet enrichment pass and return a summary.

    Steps: choose candidates (explicit ``--words`` take precedence over
    topic-based selection); stop with a ``dry-run`` summary if requested;
    otherwise enrich each candidate through ``enrich_entry``, saving the
    request cache after every word; then merge the results into the
    BabelNet archive and rebuild the enriched lexicon.

    Raises:
        SystemExit: when a real run is requested without an API key.
    """
    topic = args.topic.strip().lower()
    difficulty_level = parse_difficulty(str(args.difficulty))
    source_payload = load_source_payload(args.enriched, args.semantic)
    max_words = max(0, args.limit)

    explicit_words = getattr(args, "words", None)
    if explicit_words:
        candidates = select_word_candidates(source_payload, explicit_words, max_words, args.retry_no_match)
    else:
        candidates = select_candidates(source_payload, topic, difficulty_level, max_words, args.retry_no_match)

    if args.dry_run:
        return {
            "mode": "dry-run",
            "topic": topic,
            "difficulty": args.difficulty,
            "selected_count": len(candidates),
            "selected_words": [candidate.get("form") for candidate in candidates],
        }

    if not args.api_key:
        raise SystemExit(
            f"Chiave BabelNet mancante. Imposta {BABELNET_ENV_KEY} oppure usa --api-key <chiave>."
        )

    cache = load_json(BABELNET_CACHE_PATH, {})
    if not isinstance(cache, dict):
        cache = {}
    babelnet_payload = load_json(args.babelnet, {"entries": []})
    if not isinstance(babelnet_payload, dict):
        babelnet_payload = {"entries": []}

    enriched_candidates = []
    word_logs = []
    total = len(candidates)
    for position, candidate in enumerate(candidates, start=1):
        working = deepcopy(candidate)
        # Drop any previous BabelNet block before enriching the copy.
        working.pop("babelnet", None)
        stats = {"api_calls": 0, "cache_hits": 0, "responses": 0}
        outcome = enrich_entry(working, args.api_key, cache, args.sleep, stats)
        working["babelnet"] = outcome
        enriched_candidates.append(working)
        # The cache is written after every word, not once at the end.
        write_json(BABELNET_CACHE_PATH, cache)

        word_logs.append(
            {
                "word": working["form"],
                "api_calls": stats["api_calls"],
                "cache_hits": stats["cache_hits"],
                "responses": stats["responses"],
                "matched": bool(outcome.get("matched")),
                "synsets": len(outcome.get("synsets", []) or []),
                "reason": outcome.get("reason"),
            }
        )
        got_response = stats["responses"] > 0
        print(
            f"[{position}/{total}] {working['form']}: "
            f"api_calls={stats['api_calls']} cache_hits={stats['cache_hits']} "
            f"risposta={got_response} match={outcome.get('matched')}"
        )

    merged_babelnet = merge_babelnet_entries(babelnet_payload, enriched_candidates, topic, str(args.difficulty))
    write_json(args.babelnet, merged_babelnet)
    enriched_payload = rebuild_enriched(args.semantic, args.babelnet, args.enriched, topic)

    return {
        "mode": "enriched",
        "topic": topic,
        "difficulty": args.difficulty,
        "selected_count": total,
        "matched_count": sum(1 for log in word_logs if log["matched"]),
        "api_call_count": sum(log["api_calls"] for log in word_logs),
        "cache_hit_count": sum(log["cache_hits"] for log in word_logs),
        "word_logs": word_logs,
        "babelnet_entry_count": merged_babelnet["meta"]["entry_count"],
        "enriched_status_counts": enriched_payload["meta"]["babelnet_status_counts"],
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: run the enrichment and print a report."""
    result = run_incremental_enrichment(parse_args())

    if result["mode"] == "dry-run":
        print("Dry-run BabelNet incrementale")
        print(f"Topic: {result['topic']}")
        print(f"Difficolta: {result['difficulty']}")
        print(f"Parole selezionate: {result['selected_count']}")
        for position, word in enumerate(result["selected_words"], start=1):
            print(f"{position:2d}. {word}")
        return

    print("Arricchimento BabelNet completato")
    print(f"Topic: {result['topic']}")
    print(f"Parole interrogate: {result['selected_count']}")
    print(f"Chiamate API BabelNet reali: {result['api_call_count']}")
    print(f"Risposte da cache: {result['cache_hit_count']}")
    print(f"Match BabelNet: {result['matched_count']}")
    for log in result["word_logs"]:
        got_response = log["responses"] > 0
        print(
            f"- {log['word']}: api_calls={log['api_calls']}, "
            f"cache_hits={log['cache_hits']}, risposta={got_response}, "
            f"match={log['matched']}, synsets={log['synsets']}"
        )
    print(f"Voci BabelNet archiviate: {result['babelnet_entry_count']}")
    print(f"Stati lessico arricchito: {result['enriched_status_counts']}")


if __name__ == "__main__":
    main()
|
||||
@@ -12,11 +12,11 @@ from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional
|
||||
|
||||
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
|
||||
from main import parse_difficulty
|
||||
|
||||
|
||||
BABELNET_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_babelnet.json")
|
||||
BABELNET_CACHE_PATH = Path(__file__).with_name(".babelnet_cache.json")
|
||||
BABELNET_LOCAL_KEY_PATH = Path(__file__).with_name(".babelnet_api_key.local")
|
||||
BABELNET_API_BASE = "https://babelnet.io/v9"
|
||||
BABELNET_ENV_KEY = "BABELNET_API_KEY"
|
||||
|
||||
@@ -28,14 +28,76 @@ POS_TO_BABELNET = {
|
||||
}
|
||||
|
||||
|
||||
class BabelNetApiCallLimitReached(RuntimeError):
    """Raised when the configured cap on real BabelNet API calls is hit."""
|
||||
|
||||
|
||||
class BabelNetKeyUnavailable(RuntimeError):
    """Raised when BabelNet rejects the API key (HTTP 403)."""
|
||||
|
||||
# Named difficulty levels accepted on the command line, mapped to 1-5.
DIFFICULTY_ALIASES: Dict[str, int] = {
    "easy": 1,
    "medium": 2,
    "hard": 4,
    "expert": 5,
}


def parse_difficulty(value: str) -> int:
    """Convert a ``--difficulty`` value to an integer level from 1 to 5.

    Accepts one of the ``DIFFICULTY_ALIASES`` names (case-insensitive,
    surrounding whitespace ignored) or an integer literal.

    Raises:
        SystemExit: if the value is neither a known alias nor an integer,
            or if the integer is outside 1-5.
    """
    text = str(value).strip().lower()

    alias_level = DIFFICULTY_ALIASES.get(text)
    if alias_level is not None:
        return alias_level

    try:
        level = int(text)
    except ValueError as exc:
        raise SystemExit(
            "Valore non valido per --difficulty. Usa easy, medium, hard, expert oppure un intero tra 1 e 5."
        ) from exc

    if level < 1 or level > 5:
        raise SystemExit("Il valore numerico di --difficulty deve essere compreso tra 1 e 5.")
    return level
|
||||
|
||||
|
||||
def _split_api_keys(text: str) -> List[str]:
    """Split raw text into a list of distinct API keys, in first-seen order.

    Keys may be separated by newlines, semicolons or commas. Blank items
    and items starting with ``#`` are ignored.
    """
    flattened = text.replace(";", "\n").replace(",", "\n")
    stripped = (line.strip() for line in flattened.splitlines())
    usable = (key for key in stripped if key and not key.startswith("#"))
    # dict.fromkeys removes repeats while keeping the first occurrence order.
    return list(dict.fromkeys(usable))
|
||||
|
||||
|
||||
def load_babelnet_api_keys() -> List[str]:
    """Return the configured BabelNet API keys.

    A non-empty ``BABELNET_ENV_KEY`` environment variable wins; otherwise
    the local key file is read when it exists. Returns an empty list when
    neither source is available.
    """
    from_env = os.environ.get(BABELNET_ENV_KEY)
    if from_env:
        return _split_api_keys(from_env)
    if not BABELNET_LOCAL_KEY_PATH.exists():
        return []
    return _split_api_keys(BABELNET_LOCAL_KEY_PATH.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def load_babelnet_api_key() -> Optional[str]:
    """Return the first configured BabelNet API key, or ``None`` if none."""
    available = load_babelnet_api_keys()
    return available[0] if available else None
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Arricchisce lexicon_it_semantic.json usando BabelNet, se disponibile una API key."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--api-key",
|
||||
default=os.environ.get(BABELNET_ENV_KEY),
|
||||
help=f"Chiave API BabelNet. In alternativa imposta la variabile ambiente {BABELNET_ENV_KEY}.",
|
||||
default=load_babelnet_api_key(),
|
||||
help=(
|
||||
f"Chiave API BabelNet. In alternativa imposta {BABELNET_ENV_KEY} "
|
||||
f"o crea {BABELNET_LOCAL_KEY_PATH.name}."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--topic",
|
||||
@@ -78,10 +140,29 @@ def write_json(path: Path, payload: object) -> None:
|
||||
path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
|
||||
def request_json(endpoint: str, params: Dict[str, str], cache: Dict[str, object]) -> object:
|
||||
def cache_key(endpoint: str, params: Dict[str, str]) -> str:
    """Build the cache key for a request, leaving the API key out of it.

    The ``key`` parameter is dropped and the remaining parameters are
    sorted before URL-encoding them after the endpoint name.
    """
    public_items = sorted((name, value) for name, value in params.items() if name != "key")
    query = urllib.parse.urlencode(public_items)
    return f"{endpoint}?{query}"
|
||||
|
||||
|
||||
def request_json(
|
||||
endpoint: str,
|
||||
params: Dict[str, str],
|
||||
cache: Dict[str, object],
|
||||
stats: Optional[Dict[str, int]] = None,
|
||||
) -> object:
|
||||
url = f"{BABELNET_API_BASE}/{endpoint}?{urllib.parse.urlencode(params)}"
|
||||
if url in cache:
|
||||
return cache[url]
|
||||
key = cache_key(endpoint, params)
|
||||
if key in cache:
|
||||
if stats is not None:
|
||||
stats["cache_hits"] = stats.get("cache_hits", 0) + 1
|
||||
return cache[key]
|
||||
|
||||
if stats is not None:
|
||||
limit = stats.get("api_call_limit")
|
||||
current = stats.get("api_calls", 0)
|
||||
if limit is not None and current >= limit:
|
||||
raise BabelNetApiCallLimitReached("Limite chiamate API BabelNet raggiunto")
|
||||
|
||||
request = urllib.request.Request(url, headers={"Accept": "application/json"})
|
||||
try:
|
||||
@@ -89,9 +170,14 @@ def request_json(endpoint: str, params: Dict[str, str], cache: Dict[str, object]
|
||||
payload = json.loads(response.read().decode("utf-8"))
|
||||
except urllib.error.HTTPError as exc:
|
||||
detail = exc.read().decode("utf-8", errors="replace")
|
||||
if exc.code == 403:
|
||||
raise BabelNetKeyUnavailable(f"Chiave BabelNet non valida o limite giornaliero raggiunto: {detail}") from exc
|
||||
raise RuntimeError(f"Errore BabelNet HTTP {exc.code}: {detail}") from exc
|
||||
|
||||
cache[url] = payload
|
||||
cache[key] = payload
|
||||
if stats is not None:
|
||||
stats["api_calls"] = stats.get("api_calls", 0) + 1
|
||||
stats["responses"] = stats.get("responses", 0) + 1
|
||||
return payload
|
||||
|
||||
|
||||
@@ -180,7 +266,13 @@ def dedupe(items: Iterable[str]) -> List[str]:
|
||||
return result
|
||||
|
||||
|
||||
def enrich_entry(entry: Dict[str, object], api_key: str, cache: Dict[str, object], sleep_seconds: float) -> Dict[str, object]:
|
||||
def enrich_entry(
|
||||
entry: Dict[str, object],
|
||||
api_key: str,
|
||||
cache: Dict[str, object],
|
||||
sleep_seconds: float,
|
||||
stats: Optional[Dict[str, int]] = None,
|
||||
) -> Dict[str, object]:
|
||||
word = str(entry.get("form", ""))
|
||||
pos = POS_TO_BABELNET.get(str(entry.get("pos", "")))
|
||||
if not pos:
|
||||
@@ -195,6 +287,7 @@ def enrich_entry(entry: Dict[str, object], api_key: str, cache: Dict[str, object
|
||||
"key": api_key,
|
||||
},
|
||||
cache,
|
||||
stats,
|
||||
)
|
||||
if sleep_seconds:
|
||||
time.sleep(sleep_seconds)
|
||||
@@ -215,6 +308,7 @@ def enrich_entry(entry: Dict[str, object], api_key: str, cache: Dict[str, object
|
||||
"key": api_key,
|
||||
},
|
||||
cache,
|
||||
stats,
|
||||
)
|
||||
if sleep_seconds:
|
||||
time.sleep(sleep_seconds)
|
||||
|
||||
324
build_enriched_lexicon.py
Normal file
324
build_enriched_lexicon.py
Normal file
@@ -0,0 +1,324 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
from build_babelnet_enrichment import BABELNET_OUTPUT_PATH
|
||||
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default output file, placed next to this module.
ENRICHED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_enriched.json")

# Per topic, the synset domain labels that raise ("strong", "weak") or
# lower ("negative") a synset's score for that topic.
TOPIC_DOMAIN_RULES: Dict[str, Dict[str, Tuple[str, ...]]] = {
    "transport": {
        "strong": ("TRANSPORT_AND_TRAVEL", "NAVIGATION_AND_AVIATION"),
        "weak": ("CRAFT_ENGINEERING_AND_TECHNOLOGY", "FARMING_FISHING_AND_HUNTING"),
        "negative": (
            "MEDIA_AND_PRESS",
            "PHILOSOPHY_PSYCHOLOGY_AND_BEHAVIOR",
            "RELIGION_MYSTICISM_AND_MYTHOLOGY",
            "CHEMISTRY_AND_MINERALOGY",
        ),
    },
    "health": {
        "strong": ("HEALTH_AND_MEDICINE",),
        "weak": ("BIOLOGY",),
        "negative": ("MEDIA_AND_PRESS",),
    },
    "cinema": {
        "strong": ("MEDIA_AND_PRESS",),
        "weak": ("ART_ARCHITECTURE_AND_ARCHAEOLOGY",),
        "negative": ("HEALTH_AND_MEDICINE", "CHEMISTRY_AND_MINERALOGY"),
    },
    "nature": {
        "strong": ("BIOLOGY", "ANIMALS", "PLANTS", "EARTH", "METEOROLOGY"),
        "weak": ("GEOGRAPHY_AND_PLACES",),
        "negative": ("MEDIA_AND_PRESS",),
    },
    "ecology": {
        "strong": ("BIOLOGY", "EARTH", "METEOROLOGY"),
        "weak": ("GEOGRAPHY_AND_PLACES",),
        "negative": ("MEDIA_AND_PRESS",),
    },
}
|
||||
|
||||
# Per topic, lower-case substrings searched in a synset's text; each one
# found adds to the synset's score for that topic.
TOPIC_TEXT_KEYWORDS: Dict[str, Tuple[str, ...]] = {
    "transport": (
        "aereo", "auto", "autobus", "barca", "bicicletta", "imbarcazione", "motore",
        "nave", "pista", "trasport", "treno", "veicolo", "viaggio",
    ),
    "health": ("cura", "malato", "medic", "ospedale", "paziente", "salute", "soccorso"),
    "cinema": ("attore", "cinema", "film", "pellicola", "regia", "spettacolo"),
    "nature": ("acqua", "animale", "bosco", "fiore", "mare", "montagna", "pianta", "terra"),
    "ecology": ("ambiente", "ecologia", "inquinamento", "natura", "sostenibile"),
}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the enriched-lexicon merge step."""
    parser = argparse.ArgumentParser(
        description="Fonde lexicon_it_semantic.json con gli arricchimenti BabelNet gia disponibili."
    )
    # The three file options share the same shape: (flag, default, help).
    path_options = (
        ("--semantic", SEMANTIC_LEXICON_OUTPUT_PATH, "Lessico semantico completo di partenza."),
        ("--babelnet", BABELNET_OUTPUT_PATH, "File con arricchimenti BabelNet parziali."),
        ("--output", ENRICHED_LEXICON_OUTPUT_PATH, "Lessico arricchito da generare."),
    )
    for flag, default_path, help_text in path_options:
        parser.add_argument(flag, type=Path, default=default_path, help=help_text)
    parser.add_argument(
        "--topic",
        default=None,
        help="Topic opzionale da usare per scegliere il synset BabelNet migliore.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Read and parse a UTF-8 JSON file, or return ``default`` if it is absent."""
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Serialize ``payload`` to ``path`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped.
    """
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Return the ``(form, pos)`` key used to join lexicon and BabelNet data.

    ``normalized_form`` is preferred over ``form`` when truthy. The form is
    stripped and lower-cased, the part of speech stripped and upper-cased;
    missing or falsy values give empty strings.
    """
    chosen_form = entry.get("normalized_form") or entry.get("form") or ""
    chosen_pos = entry.get("pos") or ""
    key_form = str(chosen_form).strip().lower()
    key_pos = str(chosen_pos).strip().upper()
    return key_form, key_pos
|
||||
|
||||
|
||||
def dedupe(items: Iterable[str]) -> List[str]:
    """Return the stripped, non-empty string forms of ``items`` without repeats.

    Each item is converted with ``str`` and stripped; the first occurrence
    of each resulting text is kept, in input order.
    """
    cleaned = (str(item).strip() for item in items)
    # dict.fromkeys keeps the first occurrence of each text, in order.
    return list(dict.fromkeys(text for text in cleaned if text))
|
||||
|
||||
|
||||
def topic_candidates(entry: Dict[str, object], requested_topic: Optional[str]) -> List[str]:
    """List the topics to score synsets against, requested topic first.

    Combines ``requested_topic`` (when given) with the entry's own
    ``topics``, lower-cased and de-duplicated, and drops ``"general"``.
    """
    # ``or ()`` also covers an explicit null in the JSON, which a plain
    # ``.get(key, [])`` default does not (it would raise TypeError).
    own_topics = [str(topic).lower() for topic in (entry.get("topics") or ()) if topic]
    if requested_topic:
        own_topics.insert(0, requested_topic.lower())
    return [topic for topic in dedupe(own_topics) if topic != "general"]
|
||||
|
||||
|
||||
def synset_text(synset: Dict[str, object]) -> str:
    """Join a synset's glosses, categories and senses into one lower-case string.

    The three lists are concatenated in that order, separated by single
    spaces; missing or null lists contribute nothing.
    """
    parts = [
        str(item)
        for field in ("glosses", "categories", "senses")
        for item in synset.get(field, []) or []
    ]
    return " ".join(parts).lower()
|
||||
|
||||
|
||||
def score_synset_for_topic(synset: Dict[str, object], topic: str) -> int:
    """Score how well a synset fits ``topic``.

    Each synset domain listed for the topic in ``TOPIC_DOMAIN_RULES`` adds
    60 (strong), adds 25 (weak) or subtracts 35 (negative). Each keyword of
    ``TOPIC_TEXT_KEYWORDS`` found in the synset text adds 12. Topics with no
    rules or keywords score 0.
    """
    rules = TOPIC_DOMAIN_RULES.get(topic, {})
    domains = {str(domain).upper() for domain in synset.get("domains", []) or []}

    tier_weights = (("strong", 60), ("weak", 25), ("negative", -35))
    domain_score = sum(
        weight * len(domains.intersection(rules.get(tier, ())))
        for tier, weight in tier_weights
    )

    text = synset_text(synset)
    keyword_hits = sum(1 for keyword in TOPIC_TEXT_KEYWORDS.get(topic, ()) if keyword in text)

    return domain_score + 12 * keyword_hits
|
||||
|
||||
|
||||
def choose_best_synset(
    babelnet: Dict[str, object], entry: Dict[str, object], requested_topic: Optional[str]
) -> Tuple[Optional[Dict[str, object]], Dict[str, int]]:
    """Pick the synset that best matches the entry's candidate topics.

    Returns:
        A pair ``(best, topic_scores)``. ``best`` is a summary dict of the
        chosen synset (id, topic, topic_score, strong_topic, senses,
        glosses, categories, domains), or ``None`` when there are no
        synsets. ``topic_scores`` maps each candidate topic to the highest
        score any synset reached for it; it is empty when there are no
        synsets or no candidate topics. Without candidate topics the first
        synset is returned with a score of 0. On ties the earliest
        (topic, synset) pair wins.
    """

    def summarize(synset: Dict[str, object], topic: Optional[str], score: int, strong: bool) -> Dict[str, object]:
        # Single place that defines the shape of the summary dict.
        return {
            "id": synset.get("id"),
            "topic": topic,
            "topic_score": score,
            "strong_topic": strong,
            "senses": synset.get("senses", []),
            "glosses": synset.get("glosses", []),
            "categories": synset.get("categories", []),
            "domains": synset.get("domains", []),
        }

    synsets = [item for item in babelnet.get("synsets", []) or [] if isinstance(item, dict)]
    topics = topic_candidates(entry, requested_topic)
    if not synsets:
        return None, {}

    if not topics:
        return summarize(synsets[0], None, 0, False), {}

    topic_scores: Dict[str, int] = {}
    best_synset = None
    best_topic = None
    best_score = -10_000

    for topic in topics:
        # Score every synset once per topic and reuse the values for both
        # the per-topic maximum and the overall winner.
        scored = [(score_synset_for_topic(synset, topic), synset) for synset in synsets]
        topic_scores[topic] = max(score for score, _ in scored)
        for score, synset in scored:
            if score > best_score:
                best_score = score
                best_topic = topic
                best_synset = synset

    if not best_synset:
        return None, topic_scores

    return summarize(best_synset, best_topic, best_score, best_score >= 40), topic_scores
|
||||
|
||||
|
||||
def normalize_babelnet_status(
    entry: Dict[str, object], babelnet_entry: Optional[Dict[str, object]], requested_topic: Optional[str]
) -> Dict[str, object]:
    """Build the normalized ``babelnet`` block stored on an enriched entry.

    The ``status`` field is one of:

    * ``not_requested``: no archived BabelNet entry exists;
    * ``api_error``: the archived ``babelnet`` value is not a dict;
    * ``no_match``: the archived result is not flagged as matched;
    * ``ambiguous``: matched, but the best synset scores 0 or less;
    * ``enriched``: matched otherwise.
    """
    if not babelnet_entry:
        return {"status": "not_requested"}

    raw_babelnet = babelnet_entry.get("babelnet", {})
    if not isinstance(raw_babelnet, dict):
        return {"status": "api_error", "reason": "invalid_babelnet_payload"}

    if not raw_babelnet.get("matched"):
        return {
            "status": "no_match",
            "matched": False,
            "reason": raw_babelnet.get("reason", "no_synsets"),
            "synsets": [],
        }

    best_synset, topic_scores = choose_best_synset(raw_babelnet, entry, requested_topic)
    if best_synset:
        selected_synset_id = best_synset.get("id")
        selected_topic = best_synset.get("topic")
        topic_score = int(best_synset.get("topic_score", 0))
        strong_topic = bool(best_synset.get("strong_topic", False))
        status = "ambiguous" if topic_score <= 0 else "enriched"
    else:
        selected_synset_id = None
        selected_topic = None
        topic_score = 0
        strong_topic = False
        status = "enriched"

    return {
        "status": status,
        "matched": True,
        "selected_synset_id": selected_synset_id,
        "selected_topic": selected_topic,
        "topic_score": topic_score,
        "strong_topic": strong_topic,
        "synset_refs": raw_babelnet.get("synset_refs", []),
        "synsets": raw_babelnet.get("synsets", []),
        "topic_scores": topic_scores,
        "best_synset": best_synset,
        "source_generated_at": babelnet_entry.get("babelnet_generated_at"),
    }
|
||||
|
||||
|
||||
def build_babelnet_index(payload: Dict[str, object]) -> Dict[Tuple[str, str], Dict[str, object]]:
    """Index the archive's dict entries by ``entry_key``.

    Non-dict items are ignored; when two entries share a key the later one
    is the one stored.
    """
    return {
        entry_key(archived): archived
        for archived in payload.get("entries", []) or []
        if isinstance(archived, dict)
    }
|
||||
|
||||
|
||||
def build_enriched_lexicon(args: argparse.Namespace) -> Dict[str, object]:
    """Merge the semantic lexicon with the archived BabelNet results.

    Reads ``args.semantic`` and ``args.babelnet``. Every dict entry of the
    semantic lexicon is deep-copied and given a normalized ``babelnet``
    block chosen with ``args.topic``. Returns the payload with a ``meta``
    section (including per-status counts) and the enriched ``entries``;
    nothing is written to disk here.

    Raises:
        ValueError: if the semantic lexicon is not a dict with ``entries``.
    """
    semantic_payload = load_json(args.semantic, {})
    if not (isinstance(semantic_payload, dict) and "entries" in semantic_payload):
        raise ValueError(f"Lessico semantico non valido: {args.semantic}")

    babelnet_payload = load_json(args.babelnet, {"entries": []})
    if not isinstance(babelnet_payload, dict):
        babelnet_payload = {"entries": []}
    babelnet_index = build_babelnet_index(babelnet_payload)

    enriched_entries = []
    status_counts: Dict[str, int] = {}
    for source_entry in semantic_payload.get("entries", []) or []:
        if not isinstance(source_entry, dict):
            continue
        enriched = deepcopy(source_entry)
        archived = babelnet_index.get(entry_key(enriched))
        block = normalize_babelnet_status(enriched, archived, args.topic)
        enriched["babelnet"] = block
        status = str(block.get("status", "unknown"))
        status_counts[status] = status_counts.get(status, 0) + 1
        enriched_entries.append(enriched)

    meta = {
        "language": semantic_payload.get("meta", {}).get("language", "it"),
        "version": 1,
        "base_lexicon": args.semantic.name,
        "babelnet_source": args.babelnet.name if args.babelnet.exists() else None,
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "requested_topic": args.topic,
        "entry_count": len(enriched_entries),
        "babelnet_status_counts": status_counts,
    }
    return {"meta": meta, "entries": enriched_entries}
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: build the enriched lexicon and write it."""
    args = parse_args()
    payload = build_enriched_lexicon(args)
    write_json(args.output, payload)

    meta = payload["meta"]
    print(f"Lessico arricchito generato: {args.output}")
    print(f"Voci totali: {meta['entry_count']}")
    print(f"Stati BabelNet: {meta['babelnet_status_counts']}")


if __name__ == "__main__":
    main()
|
||||
429
build_llm_rescue_patch.py
Normal file
429
build_llm_rescue_patch.py
Normal file
@@ -0,0 +1,429 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
# Default input/output files, located in the same directory as this script.
PRIORITY_INPUT_PATH = Path(__file__).parent / "to_be_review_priority.json"
PATCH_OUTPUT_PATH = Path(__file__).parent / "llm_rescue_patch.json"
|
||||
|
||||
|
||||
# System message sent with every LLM request: it fixes the lexicographer role,
# the writing rules and the JSON shape that normalize_llm_payload() expects.
SYSTEM_PROMPT = """Sei un lessicografo italiano che prepara definizioni sintetiche per cruciverba.
Ricevi un lemma con parte del discorso e contesto semantico parziale.
Devi proporre una definizione breve in italiano, topic plausibili e tag semantici.

Regole:
- Rispondi solo con JSON valido.
- La definizione deve essere concisa, naturale e utile per un cruciverba.
- Evita di includere il lemma o derivati ovvi del lemma nella definizione.
- Se il termine sembra raro, ambiguo, refuso o poco affidabile, abbassa la confidenza e segnala needs_human_review=true.
- I topic devono essere pochi, in inglese semplice minuscolo con underscore se serve.
- I semantic_tags devono essere pochi, descrittivi e in italiano o inglese semplice.
- Non inventare dettagli enciclopedici troppo specifici se non supportati dal contesto.

Formato JSON obbligatorio:
{
"definition": "...",
"topics": ["topic1", "topic2"],
"semantic_tags": ["tag1", "tag2"],
"confidence": 0.0,
"needs_human_review": true,
"notes": "..."
}
"""
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the LLM rescue patch builder.

    Returns:
        The ``argparse`` namespace built from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Costruisce una patch di rescue lessicale usando un LLM su un lotto di voci "
            "prioritarie tratte da to_be_review_priority.json."
        )
    )

    # One (flag, add_argument keyword arguments) pair per option, in help order.
    options = (
        (
            "--input",
            {"type": Path, "default": PRIORITY_INPUT_PATH, "help": "File to_be_review_priority.json di partenza."},
        ),
        (
            "--output",
            {"type": Path, "default": PATCH_OUTPUT_PATH, "help": "Patch JSON da generare o aggiornare."},
        ),
        (
            "--limit",
            {
                "type": int,
                "default": 50,
                "help": "Numero massimo di voci da processare nel lotto. Usa 0 per tutte le voci selezionate.",
            },
        ),
        (
            "--bucket",
            {
                "default": "red",
                "help": "Bucket di priorita da considerare: red, orange, yellow oppure all.",
            },
        ),
        (
            "--provider",
            {
                "choices": ("openai_compatible", "ollama"),
                "default": "openai_compatible",
                "help": "Tipo di endpoint LLM da usare.",
            },
        ),
        (
            "--api-base",
            {
                "default": "",
                "help": (
                    "Endpoint API. Per openai_compatible: .../v1/chat/completions. "
                    "Per ollama: .../api/chat."
                ),
            },
        ),
        (
            "--api-key-env",
            {
                "default": "OPENAI_API_KEY",
                "help": "Nome della variabile d'ambiente che contiene la API key.",
            },
        ),
        (
            "--model",
            {"default": "gpt-4.1-mini", "help": "Nome del modello da interrogare."},
        ),
        (
            "--temperature",
            {"type": float, "default": 0.2, "help": "Temperatura della richiesta LLM."},
        ),
        (
            "--sleep",
            {"type": float, "default": 0.5, "help": "Pausa tra una richiesta e la successiva."},
        ),
        (
            "--skip-existing",
            {
                "action": "store_true",
                "help": "Salta le voci gia presenti nell'output con status drafted/reviewed/done.",
            },
        ),
        (
            "--dry-run",
            {
                "action": "store_true",
                "help": "Non chiama alcun LLM: prepara solo il lotto e marca le voci come selected.",
            },
        ),
    )
    for flag, settings in options:
        parser.add_argument(flag, **settings)
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Return the parsed UTF-8 JSON content of ``path``, or ``default`` if the file is missing."""
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Write ``payload`` to ``path`` as 2-space indented UTF-8 JSON, keeping non-ASCII characters."""
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def build_record(entry: Dict[str, Any]) -> Dict[str, Any]:
    """Create a fresh ``pending`` patch record from one priority-file entry.

    The identifying fields and the current definition/topics are copied, the
    available lexical evidence is gathered under ``context``, and the rescue
    fields are initialised empty.
    """
    wikt = entry.get("wiktextract") or {}
    synset = entry.get("babelnet_best_synset") or {}

    # Non-dict values for these two sources are treated as "no data".
    if isinstance(wikt, dict):
        wikt_definitions = wikt.get("definitions")
        wikt_hints = wikt.get("topic_hints", [])
    else:
        wikt_definitions = []
        wikt_hints = []
    glosses = synset.get("glosses") if isinstance(synset, dict) else []

    context = {
        "topic_suggestions": entry.get("topic_suggestions", []),
        "semantic_glosses": entry.get("semantic_glosses", []),
        "senses": entry.get("senses", []),
        "wiktextract_definitions": wikt_definitions or [],
        "wiktextract_topic_hints": wikt_hints,
        "babelnet_glosses": glosses or [],
    }

    record: Dict[str, Any] = {
        field: entry.get(field)
        for field in ("form", "lemma", "pos", "priority_bucket", "priority_score")
    }
    record["review_reasons"] = entry.get("review_reasons", [])
    record["current_topics"] = entry.get("topics", [])
    record["current_definition"] = entry.get("preferred_definition", "")
    record["current_source"] = entry.get("preferred_source", "")
    record["context"] = context
    record["rescue_definition"] = ""
    record["rescue_source"] = ""
    record["rescue_topics"] = []
    record["rescue_semantic_tags"] = []
    record["rescue_notes"] = ""
    record["confidence"] = 0.0
    record["needs_human_review"] = True
    record["status"] = "pending"
    return record
|
||||
|
||||
|
||||
def build_user_prompt(entry: Dict[str, Any]) -> str:
    """Render the user message for one patch record.

    The message is a fixed Italian instruction header followed by a JSON dump
    of the fields the model is allowed to see.
    """
    visible_fields = {
        "form": entry.get("form"),
        "lemma": entry.get("lemma"),
        "pos": entry.get("pos"),
        "current_topics": entry.get("current_topics", []),
        "review_reasons": entry.get("review_reasons", []),
        "current_definition": entry.get("current_definition", ""),
        "context": entry.get("context") or {},
    }
    header_lines = [
        "Genera una proposta di rescue lessicale per questa voce italiana.",
        "Se il termine sembra un refuso o una variante dubbia, segnalalo nelle notes.",
        "Payload:",
    ]
    rendered = json.dumps(visible_fields, ensure_ascii=False, indent=2)
    return "\n".join(header_lines + [rendered])
|
||||
|
||||
|
||||
def resolve_api_base(args: argparse.Namespace) -> str:
    """Return the endpoint URL: the explicit ``--api-base`` or the provider default."""
    explicit = args.api_base
    if explicit:
        return explicit
    provider_defaults = {"ollama": "http://localhost:11434/api/chat"}
    return provider_defaults.get(args.provider, "https://api.openai.com/v1/chat/completions")
|
||||
|
||||
|
||||
def request_openai_compatible(
    api_base: str,
    api_key: str,
    model: str,
    temperature: float,
    user_prompt: str,
) -> str:
    """POST one chat-completion request and return the assistant text.

    Args:
        api_base: Full URL of the chat-completions endpoint.
        api_key: Bearer token sent in the ``Authorization`` header.
        model: Model name forwarded in the request body.
        temperature: Sampling temperature forwarded in the request body.
        user_prompt: Content of the user message; ``SYSTEM_PROMPT`` is the system message.

    Returns:
        The stripped ``choices[0].message.content`` string.

    Raises:
        RuntimeError: On an HTTP error status, or when the response body does
            not contain ``choices[0].message.content``.
        urllib.error.URLError: On connection-level failures.
        ValueError: When the response body is not valid UTF-8 JSON.
    """
    payload = {
        "model": model,
        "temperature": temperature,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
    }
    request = urllib.request.Request(
        api_base,
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=90) as response:
            body = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"OpenAI-compatible HTTP {exc.code}: {detail}") from exc

    # A 200 response can still have an unexpected shape (error object, empty
    # choices, null content). Report it as RuntimeError, which callers already
    # handle, instead of leaking KeyError/IndexError/TypeError.
    try:
        content = body["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        snippet = json.dumps(body, ensure_ascii=False)[:300]
        raise RuntimeError(
            f"OpenAI-compatible: risposta senza choices[0].message.content: {snippet}"
        ) from exc
    if content is None:
        raise RuntimeError("OpenAI-compatible: contenuto della risposta nullo.")
    return str(content).strip()
|
||||
|
||||
|
||||
def request_ollama(
    api_base: str,
    model: str,
    temperature: float,
    user_prompt: str,
) -> str:
    """POST one non-streaming chat request to an Ollama endpoint and return the reply text.

    Args:
        api_base: Full URL of the Ollama chat endpoint.
        model: Model name forwarded in the request body.
        temperature: Sampling temperature, sent under ``options``.
        user_prompt: Content of the user message; ``SYSTEM_PROMPT`` is the system message.

    Returns:
        The stripped ``message.content`` string, or ``""`` when it is missing or null.

    Raises:
        RuntimeError: On an HTTP error status, or when the response body or its
            ``message`` field is not a JSON object.
        urllib.error.URLError: On connection-level failures.
        ValueError: When the response body is not valid UTF-8 JSON.
    """
    payload = {
        "model": model,
        "stream": False,
        "options": {"temperature": temperature},
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
    }
    request = urllib.request.Request(
        api_base,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=90) as response:
            body = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Ollama HTTP {exc.code}: {detail}") from exc

    # Reject unexpected shapes with RuntimeError (handled by callers) rather
    # than letting an AttributeError escape.
    if not isinstance(body, dict):
        raise RuntimeError(f"Ollama: risposta JSON inattesa: {str(body)[:300]}")
    message = body.get("message") or {}
    if not isinstance(message, dict):
        raise RuntimeError(f"Ollama: campo message inatteso: {str(message)[:300]}")

    content = message.get("content")
    # A null content must not turn into the literal string "None".
    return "" if content is None else str(content).strip()
|
||||
|
||||
|
||||
def extract_json_object(text: str) -> Dict[str, Any]:
    """Parse the JSON object spanning the first ``{`` to the last ``}`` of ``text``.

    Text around the braces (for example a Markdown fence) is ignored.

    Raises:
        ValueError: If no such span exists; ``json.JSONDecodeError`` (a
            ``ValueError`` subclass) if the span is not valid JSON.
    """
    candidate = text.strip()
    opening = candidate.find("{")
    closing = candidate.rfind("}")
    # rfind() returning -1 is covered by the "closing <= opening" comparison.
    if opening < 0 or closing <= opening:
        raise ValueError("Risposta LLM senza oggetto JSON riconoscibile.")
    return json.loads(candidate[opening : closing + 1])
|
||||
|
||||
|
||||
def normalize_llm_payload(payload: Dict[str, Any], model: str) -> Dict[str, Any]:
    """Convert the raw JSON object returned by the LLM into patch-record fields.

    Args:
        payload: Parsed LLM answer; expected keys are ``definition``, ``topics``,
            ``semantic_tags``, ``confidence``, ``needs_human_review``, ``notes``.
        model: Model name, recorded in ``rescue_source``.

    Returns:
        A dict with the ``rescue_*`` fields, ``confidence`` clamped to
        ``[0.0, 1.0]``, ``needs_human_review`` and ``status="drafted"``.

    Raises:
        ValueError: If ``confidence`` cannot be converted to a number.
    """

    def as_text(value: Any) -> str:
        # JSON null must not become the literal string "None".
        return "" if value is None else str(value).strip()

    def as_text_list(value: Any) -> List[str]:
        # Models sometimes answer with "a, b" instead of ["a", "b"]; iterating
        # that string directly would yield single characters.
        if isinstance(value, str):
            value = value.split(",")
        elif not isinstance(value, (list, tuple)):
            return []
        return [text for text in (as_text(item) for item in value) if text]

    raw_confidence = payload.get("confidence", 0.0)
    try:
        confidence = float(raw_confidence or 0.0)
    except TypeError as exc:
        # Keep a single exception type (ValueError) for "not a number".
        raise ValueError(f"Confidenza LLM non numerica: {raw_confidence!r}") from exc
    if confidence != confidence:
        # NaN compares unequal to itself; min()/max() would otherwise turn it into 1.0.
        confidence = 0.0
    confidence = max(0.0, min(1.0, confidence))

    review_flag = payload.get("needs_human_review")
    # Missing or null means "not stated": stay on the safe side and ask for review.
    needs_review = True if review_flag is None else bool(review_flag)

    return {
        "rescue_definition": as_text(payload.get("definition")),
        "rescue_source": f"llm_rescue:{model}",
        "rescue_topics": [topic.lower() for topic in as_text_list(payload.get("topics"))],
        "rescue_semantic_tags": as_text_list(payload.get("semantic_tags")),
        "rescue_notes": as_text(payload.get("notes")),
        "confidence": confidence,
        "needs_human_review": needs_review,
        "status": "drafted",
    }
|
||||
|
||||
|
||||
def should_skip_existing(entry: Dict[str, Any]) -> bool:
    """Tell whether a patch record already has a status of drafted, reviewed or done."""
    current_status = str(entry.get("status", "")).lower()
    return current_status in ("drafted", "reviewed", "done")
|
||||
|
||||
|
||||
def generate_patch(args: argparse.Namespace) -> Dict[str, Any]:
    """Select a batch of priority entries, query the LLM for each and merge the patch.

    Records already present in ``args.output`` are reused and updated; records
    not touched in this run are carried over unchanged. A failure on a single
    entry is recorded on that entry (``status="error"``) and never aborts the
    batch.

    Args:
        args: Parsed command-line options (see ``parse_args``).

    Returns:
        The patch payload: a ``meta`` block plus the sorted ``entries`` list.

    Raises:
        ValueError: If the priority input file is not a JSON object.
        RuntimeError: If the OpenAI-compatible provider is selected for a real
            run and the API key environment variable is empty.
    """
    # Local import: only needed for the per-entry error boundary below.
    from http.client import HTTPException

    source_payload = load_json(args.input, {"entries": []})
    if not isinstance(source_payload, dict):
        raise ValueError(f"File priority non valido: {args.input}")

    output_payload = load_json(args.output, {"entries": []})
    if not isinstance(output_payload, dict):
        output_payload = {"entries": []}

    # Existing records are indexed with the same normalisation used for lookups.
    existing_by_form = {
        _form_key(entry): entry
        for entry in output_payload.get("entries", []) or []
        if isinstance(entry, dict) and entry.get("form")
    }

    bucket = str(args.bucket or "red").strip().lower()
    source_entries = source_payload.get("practical_entries") or source_payload.get("entries") or []

    max_items = int(args.limit)
    unlimited = max_items <= 0
    selected: List[Dict[str, Any]] = []
    skipped_preselection = 0
    for entry in source_entries:
        if not isinstance(entry, dict):
            continue
        if bucket != "all" and str(entry.get("priority_bucket", "")).lower() != bucket:
            continue
        form = _form_key(entry)
        if not form:
            continue
        existing = existing_by_form.get(form)
        if args.skip_existing and existing and should_skip_existing(existing):
            skipped_preselection += 1
            continue
        selected.append(entry)
        if not unlimited and len(selected) >= max_items:
            break

    api_base = resolve_api_base(args)
    api_key = os.environ.get(args.api_key_env, "") if args.provider == "openai_compatible" else ""
    if not args.dry_run and args.provider == "openai_compatible" and not api_key:
        raise RuntimeError(
            f"Variabile d'ambiente {args.api_key_env} non valorizzata per provider openai_compatible."
        )

    # Everything a single request/parse step can reasonably raise: transport
    # errors (OSError covers URLError, timeouts and connection resets),
    # protocol errors, invalid JSON and unexpected response shapes.
    recoverable_errors = (
        OSError,
        HTTPException,
        ValueError,
        RuntimeError,
        LookupError,
        TypeError,
        AttributeError,
    )

    merged_records: List[Dict[str, Any]] = []
    processed = 0
    for source_entry in selected:
        existing = existing_by_form.get(_form_key(source_entry))
        record = dict(existing) if isinstance(existing, dict) else build_record(source_entry)

        if args.dry_run:
            record["status"] = "selected"
            record["rescue_source"] = f"llm_rescue:{args.model}"
            merged_records.append(record)
            processed += 1
            continue

        user_prompt = build_user_prompt(record)
        try:
            raw_text = _request_llm_text(args, api_base, api_key, user_prompt)
            record.update(normalize_llm_payload(extract_json_object(raw_text), args.model))
        except recoverable_errors as exc:
            record["rescue_source"] = f"llm_rescue:{args.model}"
            record["rescue_notes"] = f"errore_llm: {exc}"
            record["status"] = "error"
            record["needs_human_review"] = True
        merged_records.append(record)
        processed += 1
        print(
            f"[{processed}/{len(selected)}] {record.get('form')}: "
            f"status={record.get('status')} conf={record.get('confidence', 0.0)}"
        )
        if record.get("status") == "error" and record.get("rescue_notes"):
            print(f" dettaglio: {record.get('rescue_notes')}")
        if args.sleep > 0:
            time.sleep(args.sleep)

    # Carry over the records of previous runs that this batch did not touch.
    seen_forms = {_form_key(item) for item in merged_records}
    merged_records.extend(
        existing for form_key, existing in existing_by_form.items() if form_key not in seen_forms
    )

    status_rank = {"pending": 0, "selected": 1, "error": 2, "drafted": 3, "reviewed": 4, "done": 5}
    merged_records.sort(
        key=lambda item: (
            status_rank.get(str(item.get("status", "pending")), 9),
            -int(item.get("priority_score", 0) or 0),
            str(item.get("form", "")),
        )
    )

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_priority": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "batch_bucket": bucket,
            "batch_limit": int(args.limit),
            "provider": args.provider,
            "api_base": api_base,
            "model": args.model,
            "dry_run": bool(args.dry_run),
            "entry_count": len(merged_records),
            "processed_count": processed,
            # Skippable records are already excluded during pre-selection, so
            # nothing is skipped while processing; the key is kept for readers.
            "skipped_existing": 0,
            "skipped_preselection": skipped_preselection,
        },
        "entries": merged_records,
    }


def _form_key(entry: Dict[str, Any]) -> str:
    """Return the case- and whitespace-insensitive lookup key of an entry's form."""
    return str(entry.get("form", "")).strip().lower()


def _request_llm_text(args: argparse.Namespace, api_base: str, api_key: str, user_prompt: str) -> str:
    """Send ``user_prompt`` to the provider selected in ``args`` and return the raw reply text."""
    if args.provider == "ollama":
        return request_ollama(api_base, args.model, args.temperature, user_prompt)
    return request_openai_compatible(api_base, api_key, args.model, args.temperature, user_prompt)
|
||||
|
||||
|
||||
def main() -> None:
    """Generate the LLM rescue patch, write it to ``args.output`` and print run counters."""
    cli_args = parse_args()
    patch = generate_patch(cli_args)
    write_json(cli_args.output, patch)

    print(f"Patch LLM rescue generata: {cli_args.output}")
    meta = patch["meta"]
    print(f"Voci nel file: {meta['entry_count']}")
    print(f"Voci processate in questo run: {meta['processed_count']}")
    print(f"Voci saltate per skip-existing: {meta['skipped_existing']}")
    print(f"Voci escluse gia in pre-selezione: {meta['skipped_preselection']}")


if __name__ == "__main__":
    main()
|
||||
182
build_review_priority.py
Normal file
182
build_review_priority.py
Normal file
@@ -0,0 +1,182 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from collections import Counter
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
|
||||
# Default input/output files, located in the same directory as this script.
REVIEW_INPUT_PATH = Path(__file__).parent / "to_be_review.json"
PRIORITY_OUTPUT_PATH = Path(__file__).parent / "to_be_review_priority.json"
|
||||
|
||||
# Score contribution of each known review reason; reasons not listed here
# count 5 points in priority_score().
REASON_WEIGHTS: Dict[str, int] = dict(
    no_viable_definition=100,
    proper_noun_collision=90,
    candidate_mentions_answer=85,
    function_word=80,
    very_short_word=75,
    wiktextract_missing=55,
    only_general_topics=45,
    flagged_by_refined_stage=35,
    unresolved_sense_topics=30,
    babelnet_ambiguous=20,
)
|
||||
|
||||
# Score contribution of the source of the preferred definition; sources not
# listed here add nothing in priority_score().
SOURCE_WEIGHTS: Dict[str, int] = dict(
    fallback=50,
    babelnet=18,
    semantic=8,
    wiktextract=6,
)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the review prioritisation script.

    Returns:
        The ``argparse`` namespace built from ``sys.argv``.
    """
    cli = argparse.ArgumentParser(
        description="Costruisce un file di review prioritizzato partendo da to_be_review.json."
    )
    add_option = cli.add_argument
    add_option("--input", type=Path, default=REVIEW_INPUT_PATH, help="File to_be_review.json di partenza.")
    add_option(
        "--output",
        type=Path,
        default=PRIORITY_OUTPUT_PATH,
        help="File to_be_review_priority.json da generare.",
    )
    add_option(
        "--top",
        type=int,
        default=0,
        help="Numero massimo di voci da tenere nel file priority. 0 = tutte.",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> Dict[str, object]:
    """Read ``path`` as UTF-8 text and return the parsed JSON value."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
||||
|
||||
|
||||
def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Write ``payload`` to ``path`` as 2-space indented UTF-8 JSON, keeping non-ASCII characters."""
    rendered = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(rendered, encoding="utf-8")
|
||||
|
||||
|
||||
def priority_score(entry: Dict[str, object]) -> Tuple[int, int, int, int, str]:
    """Compute the sort key used to rank one review entry (higher sorts first).

    The first element is the numeric score: weights of the review reasons and
    of the definition source, bonuses for a missing definition and for few
    usable clue definitions, and a penalty for forms of three letters or less.
    The remaining elements break ties: number of severe reasons, whether the
    source is ``fallback``, negated definition length, and the form.

    JSON ``null`` fields are treated as empty rather than as the text "None".
    """

    def as_text(value: object) -> str:
        return "" if value is None else str(value)

    reasons = [str(item) for item in entry.get("review_reasons", []) or []]
    source = as_text(entry.get("preferred_source")).lower()
    preferred_definition = as_text(entry.get("preferred_definition"))
    form = as_text(entry.get("form"))
    clue_definitions = entry.get("clue_definitions") or {}
    if not isinstance(clue_definitions, dict):
        # Anything but a mapping carries no usable clue definitions.
        clue_definitions = {}

    score = sum(REASON_WEIGHTS.get(reason, 5) for reason in reasons)
    score += SOURCE_WEIGHTS.get(source, 0)

    if not preferred_definition:
        score += 40

    clue_count = sum(1 for value in clue_definitions.values() if as_text(value).strip())
    if clue_count == 0:
        score += 20
    elif clue_count == 1:
        score += 8

    score += min(len(reasons), 5) * 3

    # Very short forms are pushed down the ranking.
    if len(form) <= 2:
        score -= 120
    elif len(form) == 3:
        score -= 35

    severe_reasons = {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
    severe_count = sum(1 for reason in reasons if reason in severe_reasons)
    return (
        score,
        severe_count,
        int(source == "fallback"),
        -len(preferred_definition),
        form,
    )
|
||||
|
||||
|
||||
def priority_bucket(entry: Dict[str, object]) -> str:
    """Classify an entry as ``red``, ``orange`` or ``yellow`` from its review reasons.

    The first tier with at least one matching reason wins; entries matching
    no tier are ``yellow``.
    """
    reasons = {str(item) for item in entry.get("review_reasons", []) or []}
    tiers = (
        ("red", {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}),
        ("orange", {"function_word", "very_short_word", "wiktextract_missing", "only_general_topics"}),
    )
    for label, triggers in tiers:
        if not reasons.isdisjoint(triggers):
            return label
    return "yellow"
|
||||
|
||||
|
||||
def compact_entry(entry: Dict[str, object], score_tuple: Tuple[int, int, int, int, str]) -> Dict[str, object]:
    """Return a shallow copy of ``entry`` with ``priority_score`` and ``priority_bucket`` set.

    Only the first element of ``score_tuple`` (the numeric score) is stored.
    """
    return {
        **entry,
        "priority_score": score_tuple[0],
        "priority_bucket": priority_bucket(entry),
    }
|
||||
|
||||
|
||||
def build_priority_review(args: argparse.Namespace) -> Dict[str, object]:
    """Rank the review entries and build the prioritised review payload.

    Entries are sorted by ``priority_score`` (highest first), optionally cut to
    ``args.top``, and annotated with score and bucket. ``practical_entries`` is
    the subset whose form is longer than two characters.

    Args:
        args: Parsed command-line options (see ``parse_args``).

    Returns:
        A payload with ``meta``, ``entries`` and ``practical_entries``.

    Raises:
        ValueError: If the input is not a JSON object with an ``entries`` key.
    """
    payload = load_json(args.input)
    if not isinstance(payload, dict) or "entries" not in payload:
        raise ValueError(f"File review non valido: {args.input}")

    entries = [entry for entry in payload.get("entries", []) or [] if isinstance(entry, dict)]

    # Score each entry once and reuse the tuple for both sorting and annotation.
    scored = sorted(
        ((priority_score(entry), entry) for entry in entries),
        key=lambda pair: pair[0],
        reverse=True,
    )
    if args.top > 0:
        scored = scored[: args.top]

    compact_entries = [compact_entry(entry, score) for score, entry in scored]
    practical_entries = [item for item in compact_entries if len(str(item.get("form", ""))) > 2]

    bucket_counter = Counter(item["priority_bucket"] for item in compact_entries)
    practical_bucket_counter = Counter(item["priority_bucket"] for item in practical_entries)
    # "or []" keeps a null review_reasons from breaking the tally, as it is
    # already tolerated by the scoring and bucketing steps.
    reason_counter = Counter(
        str(reason)
        for item in compact_entries
        for reason in item.get("review_reasons") or []
    )

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_review": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "entry_count": len(compact_entries),
            "bucket_counts": dict(bucket_counter),
            "practical_entry_count": len(practical_entries),
            "practical_bucket_counts": dict(practical_bucket_counter),
            "top_reason_counts": dict(reason_counter.most_common(12)),
        },
        "entries": compact_entries,
        "practical_entries": practical_entries,
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Build the prioritised review file, write it to ``args.output`` and print a summary."""
    cli_args = parse_args()
    review = build_priority_review(cli_args)
    write_json(cli_args.output, review)

    print(f"Review priority generato: {cli_args.output}")
    meta = review["meta"]
    print(f"Voci nel priority file: {meta['entry_count']}")
    print(f"Bucket: {meta['bucket_counts']}")


if __name__ == "__main__":
    main()
|
||||
153
build_treccani_rescue_patch.py
Normal file
153
build_treccani_rescue_patch.py
Normal file
@@ -0,0 +1,153 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
# Default input/output files, located in the same directory as this script.
PRIORITY_INPUT_PATH = Path(__file__).parent / "to_be_review_priority.json"
PATCH_OUTPUT_PATH = Path(__file__).parent / "treccani_rescue_patch.json"
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the manual rescue patch builder.

    Returns:
        The ``argparse`` namespace built from ``sys.argv``.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Estrae un lotto prioritario dal file to_be_review_priority.json per preparare una patch "
            "manuale/assistita di rescue lessicale."
        )
    )
    add_option = cli.add_argument
    add_option(
        "--input",
        type=Path,
        default=PRIORITY_INPUT_PATH,
        help="File to_be_review_priority.json di partenza.",
    )
    add_option(
        "--output",
        type=Path,
        default=PATCH_OUTPUT_PATH,
        help="Patch JSON da generare o aggiornare.",
    )
    add_option(
        "--limit",
        type=int,
        default=100,
        help="Numero massimo di voci da preparare nel lotto.",
    )
    add_option(
        "--bucket",
        default="red",
        help="Bucket di priorita da considerare: red, orange, yellow oppure all.",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Return the parsed UTF-8 JSON content of ``path``; a missing file yields ``default``."""
    file_is_present = path.exists()
    return json.loads(path.read_text(encoding="utf-8")) if file_is_present else default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Write ``payload`` to ``path`` as 2-space indented UTF-8 JSON, keeping non-ASCII characters."""
    document = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(document, encoding="utf-8")
|
||||
|
||||
|
||||
def build_record(entry: Dict[str, object]) -> Dict[str, object]:
    """Create a fresh ``pending`` rescue record from one priority-file entry.

    Identifying fields and the current definition/topics are copied from
    ``entry``; the rescue fields start empty, with ``rescue_source`` preset to
    ``"treccani_rescue"``.
    """
    record: Dict[str, object] = {
        field: entry.get(field)
        for field in ("form", "lemma", "pos", "priority_bucket", "priority_score")
    }
    record["review_reasons"] = entry.get("review_reasons", [])
    record["current_topics"] = entry.get("topics", [])
    record["current_definition"] = entry.get("preferred_definition", "")
    record["current_source"] = entry.get("preferred_source", "")
    record.update(
        rescue_definition="",
        rescue_source="treccani_rescue",
        rescue_topics=[],
        rescue_semantic_tags=[],
        rescue_notes="",
        status="pending",
    )
    return record
|
||||
|
||||
|
||||
def build_patch(args: argparse.Namespace) -> Dict[str, object]:
    """Prepare the rescue patch for a batch of priority entries.

    Up to ``max(1, args.limit)`` entries of the requested bucket are selected.
    Records already present in ``args.output`` are reused as they are, new ones
    are created with ``build_record``, and every other existing record is
    carried over.

    Args:
        args: Parsed command-line options (see ``parse_args``).

    Returns:
        The patch payload: a ``meta`` block plus the sorted ``entries`` list.

    Raises:
        ValueError: If the priority input file is not a JSON object.
    """
    priority_payload = load_json(args.input, {"entries": []})
    if not isinstance(priority_payload, dict):
        raise ValueError(f"File priority non valido: {args.input}")

    previous_patch = load_json(args.output, {"entries": []})
    if not isinstance(previous_patch, dict):
        previous_patch = {"entries": []}

    # Index the records of a previous run by lower-cased form.
    existing_by_form = {}
    for previous in previous_patch.get("entries", []) or []:
        if isinstance(previous, dict) and previous.get("form"):
            existing_by_form[str(previous.get("form", "")).lower()] = previous

    bucket = str(args.bucket or "red").strip().lower()
    candidates = priority_payload.get("practical_entries") or priority_payload.get("entries") or []

    selected: List[Dict[str, object]] = []
    for candidate in candidates:
        if not isinstance(candidate, dict):
            continue
        wrong_bucket = bucket != "all" and str(candidate.get("priority_bucket", "")).lower() != bucket
        if wrong_bucket:
            continue
        if not str(candidate.get("form", "")).strip().lower():
            continue
        selected.append(candidate)
        if len(selected) >= max(1, int(args.limit)):
            break

    merged_records = []
    seen = set()
    for candidate in selected:
        key = str(candidate.get("form", "")).strip().lower()
        reused = existing_by_form.get(key) if key in existing_by_form else build_record(candidate)
        merged_records.append(reused)
        seen.add(key)

    # Keep the records of previous runs that are not part of this batch.
    merged_records.extend(record for key, record in existing_by_form.items() if key not in seen)

    status_rank = {"pending": 0, "drafted": 1, "reviewed": 2, "done": 3}

    def sort_key(item):
        return (
            status_rank.get(str(item.get("status", "pending")), 9),
            -int(item.get("priority_score", 0) or 0),
            str(item.get("form", "")),
        )

    merged_records.sort(key=sort_key)

    meta = {
        "language": "it",
        "version": 1,
        "base_priority": args.input.name,
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "batch_bucket": bucket,
        "batch_limit": int(args.limit),
        "entry_count": len(merged_records),
    }
    return {"meta": meta, "entries": merged_records}
|
||||
|
||||
|
||||
def main() -> None:
    """Build the rescue patch, write it to ``args.output`` and print the entry count."""
    cli_args = parse_args()
    patch = build_patch(cli_args)
    write_json(cli_args.output, patch)

    print(f"Patch rescue generata: {cli_args.output}")
    entry_count = patch["meta"]["entry_count"]
    print(f"Voci nel lotto: {entry_count}")


if __name__ == "__main__":
    main()
|
||||
423
clue_generator.py
Normal file
423
clue_generator.py
Normal file
@@ -0,0 +1,423 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH, TOPIC_DOMAIN_RULES, load_json
|
||||
from crossword_generator import HORIZONTAL, Placement
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Clue:
    """Immutable record describing one numbered crossword clue."""

    # Number of the clue.
    number: int
    # Answer word the clue refers to.
    word: str
    # Orientation label; presumably HORIZONTAL or its counterpart from
    # crossword_generator — TODO confirm against the caller.
    direction: str
    # Grid coordinates (origin and axis convention are not shown in this module).
    x: int
    y: int
    # Clue text.
    text: str
    # Label of the source the text was taken from.
    source: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ClueCandidate:
    """Immutable candidate clue text together with the attributes used to score it."""

    # Candidate clue text.
    text: str
    # Label of the source the text was taken from.
    source: str
    # Kind of candidate (e.g. "semantic_definition", "babelnet_gloss", "fallback").
    family: str
    # Style hint compared with the requested difficulty ("direct", "balanced", "oblique").
    difficulty_hint: str
    # Topic relevance score; how it is computed is not shown in this part of the file.
    topic_score: int
    # Flag that earns a scoring bonus when true.
    strong_topic: bool
|
||||
|
||||
|
||||
# Accepted difficulty spellings mapped to canonical names: the digits "1"-"5"
# plus the canonical names themselves.
DIFFICULTY_ALIASES = {
    "1": "easy",
    "2": "medium",
    "3": "hard",
    "4": "expert",
    "5": "expert",
    **{level: level for level in ("easy", "medium", "hard", "expert")},
}
|
||||
|
||||
# Lower-case fragments that mark a clue as generic; score_candidate()
# penalises candidates containing any of them.
GENERIC_CLUE_PATTERNS = (
    "termine da ricavare dagli incroci",
    "termine lessicale collegato",
    "collegato a:",
)
|
||||
|
||||
|
||||
def load_enriched_entries(path: Path = ENRICHED_LEXICON_OUTPUT_PATH) -> Dict[str, Dict[str, object]]:
    """Load the enriched lexicon and index its entries by lower-cased form.

    Returns an empty dict when the payload is not a JSON object. Entries that
    are not dicts or have no form are ignored.
    """
    payload = load_json(path, {"entries": []})
    if not isinstance(payload, dict):
        return {}

    by_form: Dict[str, Dict[str, object]] = {}
    for entry in payload.get("entries", []) or []:
        if isinstance(entry, dict) and entry.get("form"):
            by_form[str(entry.get("form", "")).lower()] = entry
    return by_form
|
||||
|
||||
|
||||
def normalize_difficulty(value: Optional[str]) -> str:
    """Map a difficulty label or digit to its canonical name, defaulting to ``"medium"``."""
    requested = str(value or "medium").strip().lower()
    return DIFFICULTY_ALIASES.get(requested, "medium")
|
||||
|
||||
|
||||
def clean_definition(text: str, answer: str) -> str:
    """Turn a raw definition into a clue sentence that does not reveal ``answer``.

    Bracketed annotations (``[...]``) are removed, whitespace is collapsed,
    every case-insensitive occurrence of ``answer`` is replaced with
    "questa parola", empty parentheses are dropped, the first letter is
    capitalised and a final period is appended.

    Args:
        text: Raw definition text; ``None`` or empty is accepted.
        answer: The word being clued. When empty, nothing is masked.

    Returns:
        The cleaned clue, or ``""`` when nothing meaningful is left.
    """
    clue = re.sub(r"\[[^\]]*\]", " ", str(text or ""))
    clue = re.sub(r"\s+", " ", clue).strip(" .;:-")
    if not clue:
        return ""

    if answer:
        # An empty pattern matches between every character, so an empty answer
        # must skip the masking step entirely.
        clue = re.sub(re.escape(answer), "questa parola", clue, flags=re.IGNORECASE)
    clue = re.sub(r"\(\s*\)", "", clue)
    clue = re.sub(r"\s+,", ",", clue)
    clue = re.sub(r"\s+;", ";", clue)

    # Removing empty parentheses can leave doubled or trailing spaces.
    clue = re.sub(r"\s+", " ", clue).strip(" .;:-")
    if not clue:
        return ""

    if clue[0].islower():
        clue = clue[0].upper() + clue[1:]
    return clue + "."
|
||||
|
||||
|
||||
def synset_has_strong_topic_domain(synset: Dict[str, object], topic: Optional[str]) -> bool:
    """Tell whether ``synset`` carries one of the strong domains configured for ``topic``.

    Returns ``True`` without inspecting the synset when there is no topic, the
    topic is ``"general"``, or no strong domains are configured for it. Domain
    names are compared case-insensitively.
    """
    if not topic or topic == "general":
        return True

    topic_rules = TOPIC_DOMAIN_RULES.get(topic, {})
    required = {str(domain).upper() for domain in topic_rules.get("strong", ())}
    if not required:
        return True

    available = {str(domain).upper() for domain in synset.get("domains", []) or []}
    return not available.isdisjoint(required)
|
||||
|
||||
|
||||
def text_contains_answer(text: str, answer: str) -> bool:
    """Tell whether ``answer`` occurs in ``text`` as a literal, case-insensitive substring."""
    literal_pattern = re.escape(answer)
    return re.search(literal_pattern, text, flags=re.IGNORECASE) is not None
|
||||
|
||||
|
||||
def directness_score(text: str) -> int:
    """Score how direct a clue text is.

    Each category keyword found in the lower-cased text adds 8 points; the
    presence of at least one explanatory marker ("cioè", "ossia", "ovvero")
    adds 4 more.
    """
    lowered = text.lower()
    category_keywords = (
        "strumento",
        "veicolo",
        "animale",
        "pianta",
        "titolo",
        "edificio",
        "persona",
        "luogo",
        "malattia",
        "farmaco",
        "mezzo",
        "parte di",
    )
    keyword_hits = sum(1 for keyword in category_keywords if keyword in lowered)
    total = 8 * keyword_hits

    explanatory_markers = ("cioè", "ossia", "ovvero")
    if any(marker in lowered for marker in explanatory_markers):
        total += 4
    return total
|
||||
|
||||
|
||||
def preferred_length_range(difficulty: str) -> Tuple[int, int]:
    """Return the preferred ``(min, max)`` clue length in characters for a difficulty.

    Any value other than easy, medium or hard gets the shortest range.
    """
    ranges = {
        "easy": (24, 90),
        "medium": (20, 75),
        "hard": (16, 60),
    }
    return ranges.get(difficulty, (14, 50))
|
||||
|
||||
|
||||
def score_candidate(candidate: ClueCandidate, answer: str, difficulty: str) -> int:
    """Rate ``candidate`` as a clue for ``answer`` at the given ``difficulty``.

    The result is an additive score: higher is better. Texts that are empty
    or shorter than 12 characters get the sentinel -10_000. The remaining
    terms reward clues that hide the answer, fit the preferred length window,
    come from a trusted family, match the difficulty style and are on topic;
    they penalise generic wording, leaked answers, usage labels on easy
    levels and very long texts.
    """
    clue_text = candidate.text
    clue_lower = clue_text.lower()
    if not clue_text or len(clue_text) < 12:
        return -10_000

    clue_len = len(clue_text)
    has_semicolon = ";" in clue_text
    family = candidate.family
    points = 0

    # Wording / answer leakage.
    if any(pattern in clue_lower for pattern in GENERIC_CLUE_PATTERNS):
        points -= 120
    points += -140 if text_contains_answer(clue_text, answer) else 40

    # Length window: overshoot costs one point per character, undershoot half.
    shortest, longest = preferred_length_range(difficulty)
    if clue_len > longest:
        points -= clue_len - longest
    elif clue_len < shortest:
        points -= (shortest - clue_len) // 2
    else:
        points += 28

    # Directness helps on easy levels and hurts on hard ones.
    directness = directness_score(clue_text)
    if difficulty == "easy":
        points += 2 * directness
    elif difficulty == "medium":
        points += directness
    elif difficulty == "hard":
        points -= max(0, directness - 6)
    else:
        points -= directness

    family_bonus = {
        "semantic_definition": 56,
        "semantic_gloss": 34,
        "refined_sense": 30,
        "babelnet_best_gloss": 18,
        "babelnet_gloss": 10,
        "fallback": 0,
    }
    points += family_bonus.get(family, 0)

    difficulty_pref = {
        "easy": {"direct", "balanced"},
        "medium": {"balanced", "direct"},
        "hard": {"balanced", "oblique"},
        "expert": {"oblique", "balanced"},
    }
    if candidate.difficulty_hint in difficulty_pref.get(difficulty, {"balanced"}):
        points += 18

    # Multi-part definitions (with ";") suit easy grids only.
    if has_semicolon:
        if difficulty == "easy":
            points += 8
        if difficulty in {"hard", "expert"}:
            points -= 8

    topic_score = candidate.topic_score
    if topic_score >= 40:
        points += 18
    elif topic_score > 0:
        points += 8
    elif family in {"babelnet_best_gloss", "babelnet_gloss"}:
        # BabelNet glosses with no topic evidence are heavily distrusted.
        points -= 140

    if candidate.strong_topic:
        points += 10

    if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", clue_lower):
        points -= 28

    # Cumulative penalties for very long texts (45, plus 90 beyond 180).
    if clue_len > 180:
        points -= 135
    elif clue_len > 120:
        points -= 45

    return points
|
||||
|
||||
|
||||
def candidate_hint(text: str, family: str) -> str:
    """Classify a clue as "direct", "oblique" or "balanced".

    Short (<= 70 chars) semantic definitions/glosses are direct; texts
    flagged as figurative, poetic or literary are oblique; anything longer
    than 85 characters is direct; everything else is balanced.
    """
    size = len(text)
    folded = text.lower()
    if size <= 70 and family in {"semantic_definition", "semantic_gloss"}:
        return "direct"
    for marker in ("fig.", "figurato", "poetico", "letterario"):
        if marker in folded:
            return "oblique"
    return "direct" if size > 85 else "balanced"
|
||||
|
||||
|
||||
def add_candidate(
    candidates: List[ClueCandidate],
    seen: set[Tuple[str, str]],
    *,
    text: str,
    answer: str,
    source: str,
    family: str,
    topic_score: int = 0,
    strong_topic: bool = False,
) -> None:
    """Clean ``text`` and append it to ``candidates`` unless already seen.

    Nothing is added when the cleaned text is empty or when the pair
    (lower-cased cleaned text, family) is already in ``seen``. Both
    ``candidates`` and ``seen`` are mutated in place.
    """
    cleaned = clean_definition(text, answer)
    dedupe_key = (cleaned.lower(), family)
    if not cleaned or dedupe_key in seen:
        return

    seen.add(dedupe_key)
    new_candidate = ClueCandidate(
        text=cleaned,
        source=source,
        family=family,
        difficulty_hint=candidate_hint(cleaned, family),
        topic_score=topic_score,
        strong_topic=strong_topic,
    )
    candidates.append(new_candidate)
|
||||
|
||||
|
||||
def semantic_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
    """Collect clue candidates from the entry's ``semantic`` section.

    Synset definitions are registered with family "semantic_definition" and
    plain glosses with family "semantic_gloss"; both use source "semantic".

    Args:
        entry: Lexicon entry; ``entry["semantic"]`` must be a dict to be used.
        answer: Solution word, masked inside each clue by ``add_candidate``.

    Returns:
        The candidates in discovery order (empty when ``semantic`` is not a dict).
    """
    semantic = entry.get("semantic", {})
    if not isinstance(semantic, dict):
        return []

    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()

    for synset in semantic.get("synsets", []) or []:
        if not isinstance(synset, dict):
            continue
        add_candidate(
            candidates,
            seen,
            # ``or ""`` keeps a JSON null from turning into the literal "None".
            text=str(synset.get("definition") or ""),
            answer=answer,
            source="semantic",
            family="semantic_definition",
        )

    for gloss in semantic.get("glosses", []) or []:
        add_candidate(
            candidates,
            seen,
            text=str(gloss or ""),
            answer=answer,
            source="semantic",
            family="semantic_gloss",
        )

    return candidates
|
||||
|
||||
|
||||
def babelnet_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
    """Collect clue candidates from the entry's ``babelnet`` section.

    Only entries whose BabelNet status is "enriched" or "ambiguous" are used.
    Glosses of ``best_synset`` become "babelnet_best_gloss" candidates with
    the synset's own topic score; glosses of the other synsets become
    "babelnet_gloss" candidates. When a specific topic is requested, synsets
    lacking a strong domain for it are skipped and the rest get score 40.

    Args:
        entry: Lexicon entry.
        answer: Solution word, masked inside each clue by ``add_candidate``.
        topic: Requested topic; falsy or "general" means no topic filter.

    Returns:
        The candidates in discovery order.
    """
    babelnet = entry.get("babelnet", {})
    if not isinstance(babelnet, dict) or babelnet.get("status") not in {"enriched", "ambiguous"}:
        return []

    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()

    best_synset = babelnet.get("best_synset", {})
    if isinstance(best_synset, dict):
        topic_score = int(best_synset.get("topic_score", 0) or 0)
        strong_topic = bool(best_synset.get("strong_topic")) or synset_has_strong_topic_domain(best_synset, topic)
        for gloss in best_synset.get("glosses", []) or []:
            add_candidate(
                candidates,
                seen,
                # ``or ""`` keeps a JSON null from turning into the literal "None".
                text=str(gloss or ""),
                answer=answer,
                source="babelnet",
                family="babelnet_best_gloss",
                topic_score=topic_score,
                strong_topic=strong_topic,
            )

    topic_is_specific = bool(topic) and topic != "general"
    for synset in babelnet.get("synsets", []) or []:
        if not isinstance(synset, dict):
            continue
        if topic_is_specific and not synset_has_strong_topic_domain(synset, topic):
            continue
        # Past the filter above, a specific topic implies a strong-domain
        # match, so the domain check does not need to run a second time.
        topic_score = 40 if topic_is_specific else 0
        for gloss in synset.get("glosses", []) or []:
            add_candidate(
                candidates,
                seen,
                text=str(gloss or ""),
                answer=answer,
                source="babelnet",
                family="babelnet_gloss",
                topic_score=topic_score,
                strong_topic=topic_is_specific,
            )

    return candidates
|
||||
|
||||
|
||||
def refined_sense_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
    """Collect clue candidates from the entry's refined ``senses`` list.

    Each sense definition becomes a "refined_sense" candidate. The sense
    confidence (0..1) is mapped to ``topic_score = int(confidence * 100)``
    and counts as a strong topic from 0.75 upwards.

    Args:
        entry: Lexicon entry; ``entry["senses"]`` must be a list to be used.
        answer: Solution word, masked inside each clue by ``add_candidate``.

    Returns:
        The candidates in discovery order.
    """
    senses = entry.get("senses", [])
    if not isinstance(senses, list):
        return []

    candidates: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()
    for sense in senses:
        if not isinstance(sense, dict):
            continue
        confidence = float(sense.get("confidence", 0.0) or 0.0)
        add_candidate(
            candidates,
            seen,
            # ``or`` fallbacks keep JSON nulls from becoming the literal "None".
            text=str(sense.get("definition") or ""),
            answer=answer,
            source=str(sense.get("source") or "refined"),
            family="refined_sense",
            topic_score=int(confidence * 100),
            strong_topic=confidence >= 0.75,
        )
    return candidates
|
||||
|
||||
|
||||
def fallback_definition(entry: Dict[str, object], answer: str) -> str:
    """Build a last-resort clue from the entry's part of speech and topics.

    Args:
        entry: Lexicon entry; ``pos`` and ``topics`` may be missing or null.
        answer: Unused; kept so all clue builders share the same signature.

    Returns:
        "Termine <pos> collegato all'ambito: <topics>." when at least one
        non-"general" topic exists, otherwise the generic crossing hint.
    """
    # ``or`` fallbacks make JSON nulls behave like missing keys: a null
    # ``topics`` used to raise TypeError and a null ``pos`` rendered as "none".
    pos = str(entry.get("pos") or "").lower()
    topics = ", ".join(
        str(topic)
        for topic in (entry.get("topics") or [])
        if topic and str(topic).lower() != "general"
    )
    if topics:
        return f"Termine {pos or 'lessicale'} collegato all'ambito: {topics}."
    return "Termine da ricavare dagli incroci."
|
||||
|
||||
|
||||
def all_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
    """Gather every clue candidate for ``entry``.

    Order of the result: semantic candidates, then refined senses, then
    BabelNet glosses.
    """
    from_semantic = semantic_candidates(entry, answer)
    from_senses = refined_sense_candidates(entry, answer)
    from_babelnet = babelnet_candidates(entry, answer, topic)
    return [*from_semantic, *from_senses, *from_babelnet]
|
||||
|
||||
|
||||
def choose_candidate(candidates: Sequence[ClueCandidate], answer: str, difficulty: str) -> Optional[ClueCandidate]:
    """Pick the best-ranked candidate, or ``None`` when there are none.

    Ranking key, highest wins: clue score, then topic score, then text
    length. Among fully tied candidates the earliest one is returned.
    """

    def ranking_key(item: ClueCandidate) -> Tuple[int, int, int]:
        return (
            score_candidate(item, answer, difficulty),
            item.topic_score,
            len(item.text),
        )

    # max() keeps the first of several maximal items, the same element a
    # stable descending sort would place at index 0.
    return max(candidates, key=ranking_key, default=None)
|
||||
|
||||
|
||||
def definition_for_word(
    word: str,
    entries: Dict[str, Dict[str, object]],
    topic: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> tuple[str, str]:
    """Return ``(clue_text, clue_source)`` for ``word``.

    The word is looked up lower-cased in ``entries``. Unknown words get the
    generic crossing hint; known words get their best candidate, or the
    entry-based fallback when no candidate exists. The source is "fallback"
    in both fallback cases.
    """
    answer = word.lower()
    entry = entries.get(answer, {})
    if not entry:
        return "Termine da ricavare dagli incroci.", "fallback"

    level = normalize_difficulty(difficulty)
    pool = all_candidates(entry, answer, topic)
    winner = choose_candidate(pool, answer, level)
    if not winner:
        return fallback_definition(entry, answer), "fallback"
    return winner.text, winner.source
|
||||
|
||||
|
||||
def generate_clues(
    placements: Iterable[Placement],
    entries: Dict[str, Dict[str, object]],
    topic: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> List[Clue]:
    """Create one ``Clue`` per placement, numbered from 1 in iteration order.

    The direction label is "orizzontale" for ``HORIZONTAL`` placements and
    "verticale" otherwise; text and source come from ``definition_for_word``.
    """

    def clue_for(number: int, placement: Placement) -> Clue:
        text, source = definition_for_word(placement.word, entries, topic, difficulty)
        label = "verticale" if placement.direction != HORIZONTAL else "orizzontale"
        return Clue(
            number=number,
            word=placement.word,
            direction=label,
            x=placement.x,
            y=placement.y,
            text=text,
            source=source,
        )

    return [clue_for(index, item) for index, item in enumerate(placements, start=1)]
|
||||
209
crossword_contract.md
Normal file
209
crossword_contract.md
Normal file
@@ -0,0 +1,209 @@
|
||||
# Contratto JSON del Cruciverba
|
||||
|
||||
Questo documento definisce il formato di scambio tra:
|
||||
|
||||
- `brain`: il motore che genera e compila il cruciverba
|
||||
- `client`: web app, backend, servizio PDF o altra macchina remota che richiede un cruciverba
|
||||
|
||||
L'obiettivo e' avere un payload:
|
||||
|
||||
- completo
|
||||
- stabile
|
||||
- espandibile
|
||||
- riusabile per stampa PDF, gioco web e archiviazione
|
||||
|
||||
## Flusso
|
||||
|
||||
1. Il client invia una `request` JSON al motore.
|
||||
2. Il motore risponde con una `response` JSON completa del cruciverba.
|
||||
3. Lo stesso JSON di risposta puo' essere:
|
||||
- salvato a database
|
||||
- convertito in PDF
|
||||
- renderizzato in una pagina web interattiva
|
||||
- riaperto in futuro senza rigenerare il cruciverba
|
||||
|
||||
## Principi di progettazione
|
||||
|
||||
- Ogni cruciverba ha un `crossword_id` univoco.
|
||||
- La `request` conserva i parametri di generazione originali.
|
||||
- La `response` include sia la griglia giocabile sia la soluzione.
|
||||
- Le parole hanno metadati ricchi: posizione, direzione, clue, tema, pos, fonte clue.
|
||||
- Le coordinate sono sempre assolute e 0-based nella griglia normalizzata esportata.
|
||||
- La griglia esportata e' rettangolare e normalizzata: niente coordinate negative.
|
||||
- Il formato supporta versioning con `schema_version`.
|
||||
|
||||
## Request
|
||||
|
||||
Campi principali:
|
||||
|
||||
- `schema_version`: versione del contratto
|
||||
- `request_id`: id della richiesta lato client
|
||||
- `requested_at`: timestamp ISO 8601
|
||||
- `generator`: configurazione del motore
|
||||
- `output`: preferenze di output
|
||||
- `client_context`: metadati opzionali del chiamante
|
||||
|
||||
### `generator`
|
||||
|
||||
- `topic`: stringa o lista di topic
|
||||
- `difficulty`: alias testuale
|
||||
- `seed`: opzionale, per riproducibilita'
|
||||
- `initial_word_count`
|
||||
- `themed_fill_count`
|
||||
- `target_empty_ratio`
|
||||
- `diffxy`
|
||||
- `time_limit_seconds`
|
||||
- `max_candidates_per_word`
|
||||
- `lexicon_file`
|
||||
- `definitions_enabled`
|
||||
- `definition_style`: per future varianti clue
|
||||
- `preferred_output_language`
|
||||
|
||||
### `output`
|
||||
|
||||
- `include_solution_grid`
|
||||
- `include_clue_sources`
|
||||
- `include_diagnostics`
|
||||
- `include_generation_log`
|
||||
- `format_hints`
|
||||
|
||||
## Response
|
||||
|
||||
Campi principali:
|
||||
|
||||
- `schema_version`
|
||||
- `request_id`
|
||||
- `crossword_id`
|
||||
- `generated_at`
|
||||
- `status`
|
||||
- `generator`
|
||||
- `summary`
|
||||
- `grid`
|
||||
- `entries`
|
||||
- `clues`
|
||||
- `solution`
|
||||
- `diagnostics`
|
||||
- `artifacts`
|
||||
|
||||
## Sezione `grid`
|
||||
|
||||
- `rows`
|
||||
- `cols`
|
||||
- `cell_size_hint`
|
||||
- `cells`
|
||||
|
||||
Ogni cella ha:
|
||||
|
||||
- `row`
|
||||
- `col`
|
||||
- `kind`: `block` oppure `letter`
|
||||
- `solution`
|
||||
- `display`
|
||||
- `number`: numero clue se la cella apre una parola
|
||||
- `across_entry_id`
|
||||
- `down_entry_id`
|
||||
- `is_prefilled`
|
||||
|
||||
Note:
|
||||
|
||||
- `solution` contiene sempre la lettera corretta per celle attive; vale `null` per le caselle nere (`kind: block`).
|
||||
- `display` e' una stringa vuota per le celle lettera nella scheda giocatore; vale `null` per le caselle nere.
|
||||
- `number` serve per numerazione in stampa e web.
|
||||
|
||||
## Sezione `entries`
|
||||
|
||||
Ogni entry rappresenta una parola collocata in griglia.
|
||||
|
||||
Campi:
|
||||
|
||||
- `entry_id`
|
||||
- `number`
|
||||
- `direction`: `across` o `down`
|
||||
- `answer`
|
||||
- `answer_length`
|
||||
- `row`
|
||||
- `col`
|
||||
- `cells`: lista coordinate
|
||||
- `clue`
|
||||
- `clue_source`
|
||||
- `topics`
|
||||
- `pos`
|
||||
- `is_seed`
|
||||
- `added_by_filler`
|
||||
- `confidence`
|
||||
|
||||
## Sezione `clues`
|
||||
|
||||
Ridondante ma utile per client semplici.
|
||||
|
||||
- `across`: lista clues orizzontali
|
||||
- `down`: lista clues verticali
|
||||
|
||||
Ogni clue:
|
||||
|
||||
- `number`
|
||||
- `entry_id`
|
||||
- `text`
|
||||
- `enumeration`
|
||||
- `topic_match`
|
||||
- `source`
|
||||
|
||||
## Sezione `solution`
|
||||
|
||||
- `grid_rows`: lista di stringhe, una per riga
|
||||
- `words`: elenco risposte
|
||||
|
||||
`grid_rows` usa:
|
||||
|
||||
- lettera maiuscola per cella piena
|
||||
- `#` per casella nera
|
||||
|
||||
## Sezione `diagnostics`
|
||||
|
||||
Serve a tuning, benchmark e debug.
|
||||
|
||||
- `total_words`
|
||||
- `seed_words_requested`
|
||||
- `seed_words_placed`
|
||||
- `filler_words_added`
|
||||
- `intersections`
|
||||
- `filled_cells`
|
||||
- `empty_cells`
|
||||
- `empty_ratio`
|
||||
- `target_empty_ratio`
|
||||
- `topic_words`
|
||||
- `off_topic_words`
|
||||
- `pos_counts`
|
||||
- `runtime_lexicon`
|
||||
- `seed`
|
||||
- `generation_seconds`
|
||||
|
||||
## Sezione `artifacts`
|
||||
|
||||
URL o path futuri per file derivati.
|
||||
|
||||
- `pdf_player`
|
||||
- `pdf_solution`
|
||||
- `thumbnail`
|
||||
- `html_preview`
|
||||
|
||||
## Estensioni future previste
|
||||
|
||||
- `difficulty_profile`: facile/medio/difficile per definizioni separate
|
||||
- `hints`: aiuti progressivi per singola parola
|
||||
- `theme_story`: testo introduttivo del cruciverba
|
||||
- `player_state`: salvataggio partita in corso
|
||||
- `stats`: tempi, errori, percentuali di completamento
|
||||
|
||||
## Regola pratica consigliata
|
||||
|
||||
La macchina "brain" deve esporre almeno due endpoint logici:
|
||||
|
||||
- `POST /crosswords/generate`
|
||||
- input: request JSON
|
||||
- output: response JSON
|
||||
|
||||
- `GET /crosswords/{crossword_id}`
|
||||
- output: stessa response JSON salvata
|
||||
|
||||
In questo modo il contratto resta identico sia via file sia via webservice.
|
||||
37
crossword_contract_example_request.json
Normal file
37
crossword_contract_example_request.json
Normal file
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"schema_version": "1.0",
|
||||
"request_id": "req-2026-04-28-0001",
|
||||
"requested_at": "2026-04-28T17:05:00+02:00",
|
||||
"generator": {
|
||||
"topic": [
|
||||
"transport"
|
||||
],
|
||||
"difficulty": "medium",
|
||||
"seed": 2,
|
||||
"initial_word_count": 19,
|
||||
"themed_fill_count": 10,
|
||||
"target_empty_ratio": 0.1667,
|
||||
"diffxy": 7,
|
||||
"time_limit_seconds": 8.0,
|
||||
"max_candidates_per_word": 12,
|
||||
"lexicon_file": "lexicon_it_curated_llm_aggressive.json",
|
||||
"definitions_enabled": true,
|
||||
"definition_style": "classic",
|
||||
"preferred_output_language": "it"
|
||||
},
|
||||
"output": {
|
||||
"include_solution_grid": true,
|
||||
"include_clue_sources": true,
|
||||
"include_diagnostics": true,
|
||||
"include_generation_log": false,
|
||||
"format_hints": {
|
||||
"pdf_page_size": "A4",
|
||||
"mobile_layout": true
|
||||
}
|
||||
},
|
||||
"client_context": {
|
||||
"channel": "web",
|
||||
"user_locale": "it-IT",
|
||||
"app_version": "alpha-1"
|
||||
}
|
||||
}
|
||||
138
crossword_contract_example_response.json
Normal file
138
crossword_contract_example_response.json
Normal file
@@ -0,0 +1,138 @@
|
||||
{
|
||||
"schema_version": "1.0",
|
||||
"request_id": "req-2026-04-28-0001",
|
||||
"crossword_id": "cw-2026-04-28-transport-0001",
|
||||
"generated_at": "2026-04-28T17:06:42+02:00",
|
||||
"status": "ok",
|
||||
"generator": {
|
||||
"topic": [
|
||||
"transport"
|
||||
],
|
||||
"difficulty": "medium",
|
||||
"seed": 2,
|
||||
"runtime_lexicon": "lexicon_it_curated_llm_aggressive.json"
|
||||
},
|
||||
"summary": {
|
||||
"title": "Cruciverba a tema trasporti",
|
||||
"subtitle": "Schema generato automaticamente",
|
||||
"rows": 12,
|
||||
"cols": 12,
|
||||
"total_words": 6,
|
||||
"intersections": 7
|
||||
},
|
||||
"grid": {
|
||||
"rows": 12,
|
||||
"cols": 12,
|
||||
"cell_size_hint": 32,
|
||||
"cells": [
|
||||
{
|
||||
"row": 0,
|
||||
"col": 0,
|
||||
"kind": "letter",
|
||||
"solution": "A",
|
||||
"display": "",
|
||||
"number": 1,
|
||||
"across_entry_id": "A1",
|
||||
"down_entry_id": null,
|
||||
"is_prefilled": false
|
||||
},
|
||||
{
|
||||
"row": 0,
|
||||
"col": 1,
|
||||
"kind": "letter",
|
||||
"solution": "M",
|
||||
"display": "",
|
||||
"number": null,
|
||||
"across_entry_id": "A1",
|
||||
"down_entry_id": "D2",
|
||||
"is_prefilled": false
|
||||
},
|
||||
{
|
||||
"row": 0,
|
||||
"col": 2,
|
||||
"kind": "block",
|
||||
"solution": null,
|
||||
"display": null,
|
||||
"number": null,
|
||||
"across_entry_id": null,
|
||||
"down_entry_id": null,
|
||||
"is_prefilled": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"entries": [
|
||||
{
|
||||
"entry_id": "A1",
|
||||
"number": 1,
|
||||
"direction": "across",
|
||||
"answer": "AMBULANZA",
|
||||
"answer_length": 9,
|
||||
"row": 0,
|
||||
"col": 0,
|
||||
"cells": [
|
||||
[0, 0],
|
||||
[0, 1],
|
||||
[0, 2]
|
||||
],
|
||||
"clue": "Veicolo di soccorso sanitario.",
|
||||
"clue_source": "semantic_definition",
|
||||
"topics": [
|
||||
"transport",
|
||||
"health"
|
||||
],
|
||||
"pos": "NOUN",
|
||||
"is_seed": true,
|
||||
"added_by_filler": false,
|
||||
"confidence": 0.95
|
||||
}
|
||||
],
|
||||
"clues": {
|
||||
"across": [
|
||||
{
|
||||
"number": 1,
|
||||
"entry_id": "A1",
|
||||
"text": "Veicolo di soccorso sanitario.",
|
||||
"enumeration": 9,
|
||||
"topic_match": true,
|
||||
"source": "semantic_definition"
|
||||
}
|
||||
],
|
||||
"down": []
|
||||
},
|
||||
"solution": {
|
||||
"grid_rows": [
|
||||
"AM#ULA######",
|
||||
"##B#########"
|
||||
],
|
||||
"words": [
|
||||
"AMBULANZA"
|
||||
]
|
||||
},
|
||||
"diagnostics": {
|
||||
"seed_words_requested": 19,
|
||||
"seed_words_placed": 19,
|
||||
"filler_words_added": 5,
|
||||
"filled_cells": 84,
|
||||
"empty_cells": 18,
|
||||
"empty_ratio": 0.1765,
|
||||
"target_empty_ratio": 0.1667,
|
||||
"topic_words": 21,
|
||||
"off_topic_words": 3,
|
||||
"pos_counts": {
|
||||
"sostantivi": 20,
|
||||
"aggettivi": 2,
|
||||
"verbi": 1,
|
||||
"avverbi": 0,
|
||||
"preposizioni": 0,
|
||||
"congiunzioni": 0,
|
||||
"altri": 1
|
||||
},
|
||||
"generation_seconds": 124.6
|
||||
},
|
||||
"artifacts": {
|
||||
"pdf_player": null,
|
||||
"pdf_solution": null,
|
||||
"thumbnail": null,
|
||||
"html_preview": null
|
||||
}
|
||||
}
|
||||
@@ -87,7 +87,12 @@ class CrosswordFiller:
|
||||
self.words_by_length = self._index_vocabulary(self.vocabulary)
|
||||
self.vocabulary_metadata = vocabulary_metadata or {}
|
||||
self.semantic_metadata = semantic_metadata or {}
|
||||
self.selected_topic = selected_topic.strip().lower()
|
||||
self.selected_topics = [
|
||||
topic.strip().lower()
|
||||
for topic in selected_topic.split(",")
|
||||
if topic.strip()
|
||||
] or ["general"]
|
||||
self.selected_topic = self.selected_topics[0]
|
||||
self.max_themed_fill_words = max(0, max_themed_fill_words)
|
||||
self.seed = seed
|
||||
self.rng = random.Random(seed)
|
||||
@@ -333,7 +338,7 @@ class CrosswordFiller:
|
||||
return score
|
||||
|
||||
def _semantic_topic_score(self, word: str) -> int:
|
||||
if not self.selected_topic or self.selected_topic == "general":
|
||||
if not self.selected_topics or self.selected_topics == ["general"]:
|
||||
return 0
|
||||
|
||||
entry = self._semantic_entry(word)
|
||||
@@ -350,9 +355,9 @@ class CrosswordFiller:
|
||||
semantic = entry.get("semantic", {})
|
||||
semantic_topics = {str(item).lower() for item in semantic.get("semantic_topics", [])}
|
||||
score = 0
|
||||
if self.selected_topic in topics:
|
||||
if any(topic in topics for topic in self.selected_topics):
|
||||
score += 4
|
||||
if self.selected_topic in semantic_topics:
|
||||
if any(topic in semantic_topics for topic in self.selected_topics):
|
||||
score += 6
|
||||
if "general" in topics:
|
||||
score += 1
|
||||
|
||||
611
curate_lexicon_alpha.py
Normal file
611
curate_lexicon_alpha.py
Normal file
@@ -0,0 +1,611 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default output files, created next to this script.
CURATED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_curated.json")
TO_BE_REVIEW_OUTPUT_PATH = Path(__file__).with_name("to_be_review.json")

# Difficulty labels; length_window() branches on these names.
DIFFICULTIES = ("easy", "medium", "hard", "expert")

# Literal substring fixes that normalize_text() applies one after another,
# in this order. An empty replacement deletes the matched text.
# NOTE(review): "quantit" is a prefix of its own replacement "quantità" and of
# the longer keys above it, so a plain sequential str.replace can touch text
# that is already correct — confirm the intended matching rule.
TEXT_REPLACEMENTS = {
    " ngrandimento": " ingrandimento",
    "superificie": "superficie",
    "quantitaaa": "quantità",
    "quantitaaaa": "quantità",
    "quantit": "quantità",
    "sanit_militare": "sanità_militare",
    " unaparola ": " una parola ",
    "questa parola, ": "",
    "questa parola; ": "",
}

# Regexes searched in the accent-folded, lower-cased definition of a common
# word; a hit marks the definition as probably describing a proper noun.
SUSPICIOUS_PROPER_PATTERNS = (
    r"\bepisodio\b",
    r"\bfilm\b",
    r"\bserie tv\b",
    r"\bfamiglia\b",
    r"\bcomune italiano\b",
    r"\bfrazione del comune\b",
    r"\bcitta metropolitana\b",
    r"\bpersonaggio\b",
    r"\balbum\b",
    r"\bcognome\b",
    r"\bnome proprio\b",
)

# Topic -> substrings (accent-free, often word stems) whose presence in a
# definition counts as evidence that the definition belongs to that topic.
DOMAIN_HINTS = {
    "religion": ("monastero", "abbazia", "sacerdot", "prete", "vescovo", "clero", "religios"),
    "transport": ("veicolo", "motore", "aereo", "treno", "nave", "trasport", "rimorch", "reattor"),
    "health": ("malat", "ferit", "ospedal", "medic", "sanitar", "cura", "paziente"),
    "nature": ("animale", "pianta", "mare", "bosco", "albero", "fiore", "montagna", "acque", "salate"),
    "geography": ("comune", "paese", "regione", "provincia", "isola", "citta", "territorio"),
    "sea": ("acque", "salate", "superficie terrestre", "oceano"),
}

# Regexes (matched on accent-folded, lower-cased text) that flag abstract or
# figurative definitions.
ABSTRACT_PATTERNS = (
    r"\bgrande quantita\b",
    r"\bfigurato\b",
    r"\bsenso figurato\b",
)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the curation script.

    Options: ``--input`` (refined lexicon), ``--output`` (curated lexicon),
    ``--review-output`` (entries needing human review) and ``--max-review``
    (cap on exported review entries, 0 meaning no cap).
    """
    parser = argparse.ArgumentParser(
        description="Cura il lessico refined per la milestone alpha e separa i casi dubbi in to_be_review.json."
    )
    # (flag, type, default, help) — declared in the order they are registered.
    option_specs = (
        ("--input", Path, REFINED_LEXICON_OUTPUT_PATH, "Lessico refined di partenza."),
        ("--output", Path, CURATED_LEXICON_OUTPUT_PATH, "Lessico curated da generare."),
        (
            "--review-output",
            Path,
            TO_BE_REVIEW_OUTPUT_PATH,
            "File JSON con le voci che richiedono revisione umana.",
        ),
        (
            "--max-review",
            int,
            0,
            "Limite opzionale di voci da esportare in to_be_review.json. 0 = tutte.",
        ),
    )
    for flag, value_type, default_value, help_text in option_specs:
        parser.add_argument(flag, type=value_type, default=default_value, help=help_text)
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> Dict[str, object]:
    """Read ``path`` as UTF-8 and return the decoded JSON document."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Serialise ``payload`` as indented, non-ASCII-escaped JSON into ``path`` (UTF-8)."""
    # Serialise first so that a non-serialisable payload fails before the
    # target file is opened for writing.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
|
||||
|
||||
|
||||
def dedupe(items: Iterable[str]) -> List[str]:
    """Return the stripped, non-empty items without case-insensitive repeats.

    The first spelling of each item wins and input order is preserved.
    """
    first_spelling: Dict[str, str] = {}
    for raw in items:
        stripped = str(raw).strip()
        if stripped:
            # setdefault keeps the earliest spelling for a given lower-cased key.
            first_spelling.setdefault(stripped.lower(), stripped)
    return list(first_spelling.values())
|
||||
|
||||
|
||||
# Accented vowel -> plain vowel, lower and upper case, built once at import.
_ASCII_FOLD_TABLE = str.maketrans(
    "àáèéìíòóùúÀÁÈÉÌÍÒÓÙÚ",
    "aaeeiioouuAAEEIIOOUU",
)


def ascii_fold(text: str) -> str:
    """Replace grave/acute accented vowels with their unaccented letter.

    Upper-case accented vowels are folded too, so ``ascii_fold(x).lower()``
    is accent-free even for capitalised text such as "È" or "CITTÀ". Other
    characters are returned unchanged.

    Args:
        text: Value to fold; it is converted with ``str()`` first.

    Returns:
        The folded string, same length as ``str(text)``.
    """
    return str(text).translate(_ASCII_FOLD_TABLE)
|
||||
|
||||
|
||||
def normalize_text(text: str) -> str:
    """Normalise a definition into a single capitalised sentence.

    Applies the ``TEXT_REPLACEMENTS`` fixes, collapses whitespace, puts
    exactly one space after ``;`` and ``,`` (none before), trims edge
    punctuation, capitalises the first letter and appends a full stop.

    Args:
        text: Raw definition; ``None``/blank yields ``""``.

    Returns:
        The normalised sentence, or ``""`` when no content remains.
    """
    value = str(text or "").strip()
    if not value:
        return ""
    for old, new in TEXT_REPLACEMENTS.items():
        pattern = re.escape(old)
        # A key that ends in a letter/digit is a misspelt or truncated word:
        # only fix it when the word really ends there. Without this guard
        # "quantit" -> "quantità" also rewrote the already-correct "quantità"
        # (giving "quantitàà") and unrelated words such as "quantitativo".
        if old[-1:].isalnum():
            pattern += r"(?!\w)"
        # A callable replacement keeps ``new`` literal (no backslash escapes).
        value = re.sub(pattern, lambda _match, _new=new: _new, value)
    value = re.sub(r"\s+", " ", value)
    value = re.sub(r"\s*;\s*", "; ", value)
    value = re.sub(r"\s*,\s*", ", ", value)
    value = value.strip(" .;:-")
    if not value:
        # Punctuation-only input used to come back as a bare ".".
        return ""
    if value[0].islower():
        value = value[0].upper() + value[1:]
    return value + "."
|
||||
|
||||
|
||||
def split_definition_text(text: str) -> List[str]:
    """Split a definition into normalised pieces.

    The text is cut at ``;`` followed by whitespace and at a full stop
    followed by whitespace and a letter; each piece goes through
    ``normalize_text`` and empty results are dropped.
    """
    stripped = str(text or "").strip()
    if not stripped:
        return []
    fragments = re.split(r"\s*;\s+|\.\s+(?=[a-zàèéìòù])", stripped, flags=re.IGNORECASE)
    return [cleaned for cleaned in map(normalize_text, fragments) if cleaned]
|
||||
|
||||
|
||||
def entry_is_common_word(entry: Dict[str, object]) -> bool:
    """Return True for entries whose form starts lower-case and has no name tags."""
    form = str(entry.get("form", ""))
    if not form or not form[:1].islower():
        return False
    name_tags = entry.get("name_tags") or []
    return not name_tags
|
||||
|
||||
|
||||
def definition_mentions_answer(text: str, answer: str) -> bool:
    """Return True when ``answer`` appears in ``text``, ignoring case and the folded accents."""
    haystack = ascii_fold(text).lower()
    needle = ascii_fold(answer).lower()
    return re.search(re.escape(needle), haystack) is not None
|
||||
|
||||
|
||||
def suspicious_proper_noun_definition(text: str, entry: Dict[str, object]) -> bool:
    """Flag definitions of common words that read like proper-noun descriptions.

    Only common-word entries are checked; the accent-folded, lower-cased text
    is searched for each pattern in ``SUSPICIOUS_PROPER_PATTERNS``.
    """
    if not entry_is_common_word(entry):
        return False
    folded = ascii_fold(text).lower()
    for pattern in SUSPICIOUS_PROPER_PATTERNS:
        if re.search(pattern, folded):
            return True
    return False
|
||||
|
||||
|
||||
def likely_abstract_detour(text: str) -> bool:
    """Return True when the folded, lower-cased text matches any ``ABSTRACT_PATTERNS`` regex."""
    folded = ascii_fold(text).lower()
    for pattern in ABSTRACT_PATTERNS:
        if re.search(pattern, folded):
            return True
    return False
|
||||
|
||||
|
||||
def semantic_topics(entry: Dict[str, object]) -> List[str]:
    """List the lower-cased topics from the semantic and wiktextract sections.

    ``semantic.semantic_topics`` come first, then ``wiktextract.topic_hints``;
    sections that are not dicts are ignored and repeats are removed by
    ``dedupe``.
    """
    gathered: List[str] = []
    for section_name, field_name in (
        ("semantic", "semantic_topics"),
        ("wiktextract", "topic_hints"),
    ):
        section = entry.get(section_name, {})
        if isinstance(section, dict):
            gathered += [str(item).lower() for item in section.get(field_name, []) or []]
    return dedupe(gathered)
|
||||
|
||||
|
||||
def lexical_topics(entry: Dict[str, object]) -> List[str]:
    """Return the entry's truthy ``topics`` values, lower-cased, in their original order."""
    raw_topics = entry.get("topics", []) or []
    lowered: List[str] = []
    for item in raw_topics:
        if item:
            lowered.append(str(item).lower())
    return lowered
|
||||
|
||||
|
||||
def topic_alignment_score(text: str, entry: Dict[str, object]) -> int:
    """Score how well ``text`` matches the entry's topics.

    For every lexical or semantic topic of the entry, each of its
    ``DOMAIN_HINTS`` substrings found in the folded, lower-cased text adds 16.
    """
    haystack = ascii_fold(text).lower()
    active_topics = set(lexical_topics(entry)) | set(semantic_topics(entry))
    hits = sum(
        1
        for topic in active_topics
        for hint in DOMAIN_HINTS.get(topic, ())
        if hint in haystack
    )
    return 16 * hits
|
||||
|
||||
|
||||
def candidate_style(text: str) -> str:
    """Classify a definition as "direct", "balanced" or "oblique".

    Texts containing ";" or longer than 90 characters are direct; texts with
    a defining marker (relative pronoun or category word) are balanced;
    everything else is oblique.
    """
    folded = ascii_fold(text).lower()
    if ";" in text or len(text) > 90:
        return "direct"
    defining_markers = ("chi ", "che ", "strumento", "veicolo", "titolo", "parte di")
    has_marker = any(marker in folded for marker in defining_markers)
    return "balanced" if has_marker else "oblique"
|
||||
|
||||
|
||||
def length_window(difficulty: str) -> Tuple[int, int]:
    """Return the (min, max) definition length preferred for ``difficulty``.

    Labels other than "easy", "medium" and "hard" get (12, 55).
    """
    windows = (
        ("easy", (18, 90)),
        ("medium", (18, 78)),
        ("hard", (14, 62)),
    )
    for label, bounds in windows:
        if difficulty == label:
            return bounds
    return 12, 55
|
||||
|
||||
|
||||
def build_candidate(
    text: str,
    *,
    source: str,
    family: str,
    confidence: float,
    priority: int = 0,
) -> Dict[str, object]:
    """Build the candidate record for one definition piece.

    The text is normalised first; ``style`` is derived from the normalised
    text. All other fields are stored as given.
    """
    normalized = normalize_text(text)
    style = candidate_style(normalized)
    return dict(
        text=normalized,
        source=source,
        family=family,
        confidence=confidence,
        style=style,
        priority=priority,
    )
|
||||
|
||||
|
||||
def collect_candidates(entry: Dict[str, object]) -> List[Dict[str, object]]:
    """Collect every definition candidate available for ``entry``.

    Sources are visited in this order: semantic synset definitions, semantic
    glosses, refined senses, BabelNet best-synset glosses and wiktextract
    definitions. Each text is split with ``split_definition_text``; a piece
    is kept once per (lower-cased text, family) pair. Within a source the
    priority decreases with the item's position and never drops below 0.

    Args:
        entry: Lexicon entry; missing, null or wrongly typed sections are skipped.

    Returns:
        Candidate records as produced by ``build_candidate``, in discovery order.
    """
    candidates: List[Dict[str, object]] = []
    seen = set()

    def register(raw_text: object, *, source: str, family: str, confidence: float, priority: int) -> None:
        """Split ``raw_text`` and append each new piece to ``candidates``."""
        # ``or ""`` keeps a JSON null from becoming the literal text "None".
        for piece in split_definition_text(str(raw_text or "")):
            candidate = build_candidate(
                piece,
                source=source,
                family=family,
                confidence=confidence,
                priority=priority,
            )
            key = (candidate["text"].lower(), candidate["family"])
            if candidate["text"] and key not in seen:
                seen.add(key)
                candidates.append(candidate)

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        for index, synset in enumerate(semantic.get("synsets", []) or []):
            if not isinstance(synset, dict):
                continue
            register(
                synset.get("definition"),
                source="semantic",
                family="semantic_definition",
                confidence=0.9,
                priority=max(0, 100 - index * 12),
            )
        for index, gloss in enumerate(semantic.get("glosses", []) or []):
            register(
                gloss,
                source="semantic_gloss",
                family="semantic_gloss",
                confidence=0.8,
                priority=max(0, 90 - index * 10),
            )

    for index, sense in enumerate(entry.get("senses", []) or []):
        if not isinstance(sense, dict):
            continue
        sense_source = str(sense.get("source") or "refined")
        register(
            sense.get("definition"),
            # Senses copied from the semantic layer are relabelled "refined".
            source="refined" if sense_source == "semantic" else sense_source,
            family="refined_sense",
            confidence=float(sense.get("confidence", 0.7) or 0.7),
            priority=max(0, 80 - index * 8),
        )

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        best_synset = babelnet.get("best_synset", {})
        if isinstance(best_synset, dict):
            babelnet_confidence = 0.85 if babelnet.get("status") == "enriched" else 0.55
            for index, gloss in enumerate(best_synset.get("glosses", []) or []):
                register(
                    gloss,
                    source="babelnet",
                    family="babelnet_gloss",
                    confidence=babelnet_confidence,
                    priority=max(0, 60 - index * 8),
                )

    wiktextract = entry.get("wiktextract", {})
    if isinstance(wiktextract, dict):
        wiktextract_confidence = 0.78 if wiktextract.get("matched") else 0.45
        for index, definition in enumerate(wiktextract.get("definitions", []) or []):
            register(
                definition,
                source="wiktextract",
                family="wiktextract_definition",
                confidence=wiktextract_confidence,
                priority=max(0, 88 - index * 9),
            )

    return candidates
|
||||
|
||||
|
||||
def score_candidate(candidate: Dict[str, object], entry: Dict[str, object], difficulty: str) -> int:
    """Compute the ranking score of one clue candidate for a difficulty level.

    The score is a sum of bonuses and penalties: answer leakage, proper-noun
    suspicion, abstract detours, length window fit, source/family bonuses,
    candidate priority and confidence, topic alignment, clue style and a few
    textual markers.

    Args:
        candidate: Candidate mapping; ``text`` is required, ``source``,
            ``family``, ``confidence``, ``priority`` and ``style`` are optional.
        entry: Lexicon entry the candidate belongs to.
        difficulty: Difficulty label used for the length window and style bonus.

    Returns:
        The integer score, or ``-10_000`` when the text is shorter than
        12 characters.
    """
    clue = str(candidate["text"])
    solution = str(entry.get("form", "")).lower()
    origin = str(candidate.get("source"))
    kind = str(candidate.get("family"))
    trust = float(candidate.get("confidence", 0.0) or 0.0)

    # Very short texts are rejected outright, before any heuristic runs.
    if len(clue) < 12:
        return -10_000

    # A definition that contains the answer is heavily penalised.
    total = -140 if definition_mentions_answer(clue, solution) else 30
    if suspicious_proper_noun_definition(clue, entry):
        total -= 220
    if likely_abstract_detour(clue):
        total -= 80

    # Length fit: flat bonus inside the window, full penalty per extra
    # character above it, half penalty per missing character below it.
    shortest, longest = length_window(difficulty)
    size = len(clue)
    if size > longest:
        total -= size - longest
    elif size < shortest:
        total -= (shortest - size) // 2
    else:
        total += 24

    source_bonus = {
        "semantic": 55,
        "semantic_gloss": 40,
        "babelnet": 24,
        "refined": 30,
        "wiktextract": 52,
    }
    family_bonus = {
        "semantic_definition": 30,
        "semantic_gloss": 18,
        "babelnet_gloss": 8,
        "refined_sense": 22,
        "wiktextract_definition": 28,
    }
    total += source_bonus.get(origin, 10)
    total += family_bonus.get(kind, 0)
    total += int(candidate.get("priority", 0) or 0)
    total += int(trust * 35)

    # Entries tagged with a concrete topic are penalised when the text shows
    # no alignment with the entry topics at all.
    alignment = topic_alignment_score(clue, entry)
    total += alignment
    entry_topics = set(lexical_topics(entry)) | set(semantic_topics(entry))
    concrete = entry_topics & {"religion", "transport", "health", "nature", "geography", "sea"}
    if concrete and alignment == 0:
        total -= 45

    # Bonus when the candidate style suits the requested difficulty.
    style_bonus = {
        ("easy", "direct"): 16,
        ("medium", "direct"): 14,
        ("medium", "balanced"): 14,
        ("hard", "balanced"): 10,
        ("expert", "oblique"): 10,
    }
    total += style_bonus.get((difficulty, str(candidate.get("style"))), 0)

    # Usage labels such as "(fig.)" are unwelcome in the easier levels.
    if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", clue.lower()):
        total -= 30
    if difficulty in {"hard", "expert"} and ";" in clue:
        total -= 10
    if entry.get("needs_review"):
        total -= 8

    return total
|
||||
|
||||
|
||||
def choose_best_candidate(
    candidates: Sequence[Dict[str, object]],
    entry: Dict[str, object],
    difficulty: str,
) -> Optional[Dict[str, object]]:
    """Pick the highest-ranked candidate for ``difficulty``.

    Candidates are ordered by score, then confidence, then priority, and
    finally by shorter text.

    Returns:
        The best candidate mapping, or ``None`` when ``candidates`` is empty.
    """

    def ranking(option: Dict[str, object]) -> tuple:
        # Negative length so that, with reverse ordering, shorter texts win ties.
        return (
            score_candidate(option, entry, difficulty),
            float(option.get("confidence", 0.0)),
            float(option.get("priority", 0.0)),
            -len(str(option.get("text", ""))),
        )

    ordered = sorted(candidates, key=ranking, reverse=True)
    if not ordered:
        return None
    return ordered[0]
|
||||
|
||||
|
||||
def review_reasons(entry: Dict[str, object], candidates: Sequence[Dict[str, object]]) -> List[str]:
    """List the reasons why a curated entry should be reviewed by a human.

    Args:
        entry: Curated entry; ``preferred_definition`` and ``preferred_source``
            are read from it together with the enrichment blocks.
        candidates: Candidate definitions collected for the entry.

    Returns:
        The reason labels, in check order, passed through ``dedupe``.
    """
    form = str(entry.get("form", ""))
    lexical = set(lexical_topics(entry))
    semantic = set(semantic_topics(entry))
    babelnet_status = str((entry.get("babelnet") or {}).get("status", ""))
    wiktextract = entry.get("wiktextract", {})
    wiktextract_status = str(wiktextract.get("status", "")) if isinstance(wiktextract, dict) else ""
    definition = str(entry.get("preferred_definition", ""))
    source = str(entry.get("preferred_source", ""))

    def has_unresolved_sense_topics() -> bool:
        # A sense whose stringified topics contain "None" counts as unresolved.
        return any(
            "None" in str(sense.get("topics"))
            for sense in entry.get("senses", [])
            if isinstance(sense, dict)
        )

    # Each check is lazy so the helper heuristics only run when the original
    # short-circuit conditions would have reached them.
    checks = (
        ("no_viable_definition", lambda: not candidates),
        ("flagged_by_refined_stage", lambda: not definition and entry.get("needs_review")),
        ("proper_noun_collision", lambda: definition and suspicious_proper_noun_definition(definition, entry)),
        ("babelnet_ambiguous", lambda: babelnet_status == "ambiguous" and source == "babelnet"),
        ("wiktextract_missing", lambda: wiktextract_status in {"missing", "no_match"} and not definition),
        ("only_general_topics", lambda: lexical == {"general"} and not semantic and not definition),
        ("very_short_word", lambda: len(form) <= 2),
        ("function_word", lambda: str(entry.get("pos", "")) in {"PREP", "CONJ"}),
        ("unresolved_sense_topics", lambda: source == "babelnet" and has_unresolved_sense_topics()),
        ("candidate_mentions_answer", lambda: definition and definition_mentions_answer(definition, form.lower())),
    )
    return dedupe([label for label, applies in checks if applies()])
|
||||
|
||||
|
||||
def curate_entry(entry: Dict[str, object]) -> Tuple[Dict[str, object], Optional[Dict[str, object]]]:
    """Curate one lexicon entry and, when needed, build its review record.

    The input entry is deep-copied, never mutated. For every difficulty in
    ``DIFFICULTIES`` the best candidate is selected; the medium clue (falling
    back to the easy one) becomes the preferred definition.

    Returns:
        ``(curated, review_item)`` where ``review_item`` is ``None`` when no
        review reason was raised.
    """
    curated = deepcopy(entry)
    candidates = collect_candidates(curated)

    clue_definitions: Dict[str, str] = {}
    clue_sources: Dict[str, str] = {}
    clue_scores: Dict[str, int] = {}
    for level in DIFFICULTIES:
        winner = choose_best_candidate(candidates, curated, level)
        if not winner:
            continue
        clue_definitions[level] = str(winner["text"])
        clue_sources[level] = str(winner["source"])
        clue_scores[level] = score_candidate(winner, curated, level)

    preferred_definition = clue_definitions.get("medium") or clue_definitions.get("easy") or ""
    preferred_source = clue_sources.get("medium") or clue_sources.get("easy") or "fallback"

    curation_notes: List[str] = []
    if preferred_definition:
        curation_notes.append(f"preferred_from={preferred_source}")
    # A missing medium clue uses the -9999 default and is flagged as weak too.
    if clue_scores.get("medium", -9999) < 20:
        curation_notes.append("weak_medium_definition")

    # Fields shared by the sense summary and the review candidate pool.
    shared_fields = ("source", "family", "confidence", "priority")

    curated.update(
        {
            "curated_glosses": dedupe(option["text"] for option in candidates),
            "curated_senses": [
                {"definition": option["text"], **{name: option[name] for name in shared_fields}}
                for option in candidates
            ],
            "preferred_definition": preferred_definition,
            "preferred_source": preferred_source,
            "clue_definitions": clue_definitions,
            "clue_sources": clue_sources,
            "clue_scores": clue_scores,
            "curation_notes": curation_notes,
        }
    )

    # review_reasons reads the preferred_* fields stored just above.
    reasons = review_reasons(curated, candidates)
    severe = {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
    curated["alpha_ready"] = bool(preferred_definition) and severe.isdisjoint(reasons)
    curated["review_reasons"] = reasons

    if not reasons:
        return curated, None

    babelnet_block = curated.get("babelnet") or {}
    review_item = {
        "form": curated.get("form"),
        "lemma": curated.get("lemma"),
        "pos": curated.get("pos"),
        "topics": curated.get("topics"),
        "topic_suggestions": curated.get("topic_suggestions"),
        "preferred_definition": preferred_definition,
        "preferred_source": preferred_source,
        "clue_definitions": clue_definitions,
        "review_reasons": reasons,
        "semantic_glosses": (curated.get("semantic") or {}).get("glosses", []),
        "senses": curated.get("senses", []),
        "babelnet_status": babelnet_block.get("status"),
        "babelnet_best_synset": babelnet_block.get("best_synset"),
        "wiktextract_status": (curated.get("wiktextract") or {}).get("status"),
        "wiktextract": curated.get("wiktextract"),
        # Only the first 12 candidates are exposed to the reviewer.
        "candidate_pool": [
            {"text": option["text"], **{name: option[name] for name in shared_fields}}
            for option in candidates[:12]
        ],
    }
    return curated, review_item
|
||||
|
||||
|
||||
def build_curated_lexicon(args: argparse.Namespace) -> Tuple[Dict[str, object], Dict[str, object]]:
    """Curate every entry of the input lexicon and build both output payloads.

    Args:
        args: Parsed CLI namespace; ``input`` and ``max_review`` are read.

    Returns:
        ``(curated_payload, review_payload)``, each with ``meta`` and ``entries``.

    Raises:
        ValueError: If the input is not a mapping with an ``entries`` key.
    """
    source = load_json(args.input)
    if not (isinstance(source, dict) and "entries" in source):
        raise ValueError(f"Lessico refined non valido: {args.input}")

    curated_entries: List[Dict[str, object]] = []
    review_entries: List[Dict[str, object]] = []
    for raw_entry in source.get("entries", []) or []:
        if isinstance(raw_entry, dict):
            curated, review_item = curate_entry(raw_entry)
            curated_entries.append(curated)
            if review_item:
                review_entries.append(review_item)

    # A positive max_review caps the review list; the counts below reflect
    # the capped list.
    if args.max_review > 0:
        review_entries = review_entries[: args.max_review]

    ready_total = len([item for item in curated_entries if item.get("alpha_ready")])
    curated_payload = {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "entry_count": len(curated_entries),
            "alpha_ready_count": ready_total,
            "review_count": len(review_entries),
        },
        "entries": curated_entries,
    }
    review_payload = {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "entry_count": len(review_entries),
        },
        "entries": review_entries,
    }
    return curated_payload, review_payload
|
||||
|
||||
|
||||
def main() -> None:
    """Run the curation from the command line: build, write, report."""
    args = parse_args()
    curated_payload, review_payload = build_curated_lexicon(args)
    write_json(args.output, curated_payload)
    write_json(args.review_output, review_payload)

    curated_meta = curated_payload["meta"]
    report = (
        f"Lessico curated generato: {args.output}",
        f"Voci totali: {curated_meta['entry_count']}",
        f"Voci alpha_ready: {curated_meta['alpha_ready_count']}",
        f"Voci da revisionare: {review_payload['meta']['entry_count']}",
        f"File review generato: {args.review_output}",
    )
    for line in report:
        print(line)


if __name__ == "__main__":
    main()
|
||||
492
enrich_review_from_wiktextract_file.py
Normal file
492
enrich_review_from_wiktextract_file.py
Normal file
@@ -0,0 +1,492 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default file locations, all resolved next to this script.
REVIEW_INPUT_PATH = Path(__file__).with_name("to_be_review.json")
WIKTEXTRACT_INPUT_PATH = Path(__file__).with_name("raw-wiktextract-data.jsonl")
WIKTEXTRACT_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_refined_plus_wiktextract.json")
WIKTEXTRACT_INDEX_CACHE_PATH = Path(__file__).with_name(".wiktextract_it_index.json")

# Review reasons used as the default value of the --review-reasons option.
DEFAULT_REVIEW_REASONS = {"no_viable_definition", "only_general_topics", "babelnet_ambiguous"}

# Lowercase part-of-speech label -> uppercase tag (looked up by map_pos).
# The "adj" key used to be declared twice with the same value; the redundant
# duplicate was removed, the resulting mapping is unchanged.
POS_MAP = {
    "noun": "NOUN",
    "adj": "ADJ",
    "verb": "VERB",
    "adv": "ADV",
    "prep": "PREP",
    "conj": "CONJ",
    "pron": "PRON",
    "intj": "INTJ",
}

# Lowercase sense-level topic label -> internal topic name.
TOPIC_MAP = {
    "christianity": "religion",
    "religion": "religion",
    "history": "history",
    "agriculture": "agriculture",
    "engineering": "technology",
    "mechanics": "technology",
    "technology": "technology",
    "medicine": "health",
    "geography": "geography",
    "biology": "nature",
    "aeronautics": "transport",
}

# Lowercase category label -> internal topic name.
CATEGORY_TOPIC_HINTS = {
    "religione-it": "religion",
    "cristianesimo-it": "religion",
    "storia-it": "history",
    "agricoltura-it": "agriculture",
    "medicina-it": "health",
    "ingegneria-it": "technology",
    "meccanica-it": "technology",
    "tecnologia-it": "technology",
    "geografia-it": "geography",
    "biologia-it": "nature",
    "aeronautica-it": "transport",
}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Declare the command-line options of the offline enrichment and parse them.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Arricchisce il lessico refined leggendo offline il file raw-wiktextract-data.jsonl, "
            "senza effettuare richieste di rete."
        )
    )

    # Path-valued options all share the same shape, so they are declared as data.
    path_options = (
        ("--input", REFINED_LEXICON_OUTPUT_PATH, "Lessico refined di partenza."),
        ("--review", REVIEW_INPUT_PATH, "File to_be_review.json da usare per selezionare i lemmi prioritari."),
        ("--wiktextract", WIKTEXTRACT_INPUT_PATH, "File JSONL raw estratto da Wiktionary."),
        ("--output", WIKTEXTRACT_OUTPUT_PATH, "Lessico refined con blocco wiktextract aggiunto."),
        (
            "--index-cache",
            WIKTEXTRACT_INDEX_CACHE_PATH,
            "Cache dell'indice lemmi->righe del JSONL per velocizzare i rilanci.",
        ),
    )
    for flag, fallback, description in path_options:
        cli.add_argument(flag, type=Path, default=fallback, help=description)

    cli.add_argument(
        "--word-limit",
        type=int,
        default=0,
        help="Limite massimo di parole da elaborare. 0 = tutte le candidate.",
    )
    cli.add_argument(
        "--words",
        default="",
        help="Lista separata da virgole di lemmi specifici da arricchire.",
    )
    cli.add_argument(
        "--review-reasons",
        default=",".join(sorted(DEFAULT_REVIEW_REASONS)),
        help="Motivi del file review da trattare con priorita, separati da virgole.",
    )
    cli.add_argument(
        "--skip-existing",
        action="store_true",
        help="Salta le voci che nel lessico di input hanno gia un blocco wiktextract utile.",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Return the JSON document stored at ``path``, or ``default`` if the path does not exist."""
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Serialize ``payload`` as UTF-8 JSON (2-space indent, non-ASCII kept as-is) into ``path``."""
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def parse_csv_set(value: str) -> set[str]:
    """Split a comma-separated string into a set of trimmed, lowercased, non-empty tokens."""
    tokens = (chunk.strip() for chunk in str(value or "").split(","))
    return {token.lower() for token in tokens if token}
|
||||
|
||||
|
||||
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Return the ``(form, pos)`` key of an entry.

    The form comes from ``normalized_form`` (falling back to ``form``) and is
    lowercased; the part of speech is uppercased. Both are stripped.
    """
    raw_form = entry.get("normalized_form") or entry.get("form") or ""
    raw_pos = entry.get("pos") or ""
    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
|
||||
|
||||
|
||||
def load_or_build_index(jsonl_path: Path, index_cache_path: Path) -> Dict[str, List[int]]:
    """Return the word -> file positions index of the Italian rows of a JSONL dump.

    The index is read from ``index_cache_path`` when its stored metadata
    (resolved source path, size, mtime) matches the current file; otherwise
    the JSONL file is scanned and the cache is rewritten.

    Args:
        jsonl_path: JSONL file with one JSON object per line.
        index_cache_path: JSON file used to persist the index between runs.

    Returns:
        Mapping from lowercased word to the positions, as reported by
        ``tell()`` on the text-mode handle, of its rows with
        ``lang_code == "it"``.

    Raises:
        FileNotFoundError: If the index must be rebuilt and ``jsonl_path``
            does not exist.
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    cached = load_json(index_cache_path, {})

    # One stat() call so size and mtime come from the same snapshot.
    if jsonl_path.exists():
        file_stat = jsonl_path.stat()
        size, mtime = file_stat.st_size, file_stat.st_mtime
    else:
        size, mtime = 0, 0
    expected_meta = {
        "source": str(jsonl_path.resolve()),
        "size": size,
        "mtime": mtime,
    }
    if (
        isinstance(cached, dict)
        and cached.get("meta") == expected_meta
        and isinstance(cached.get("index"), dict)
    ):
        return {str(key): list(value) for key, value in cached["index"].items()}

    index: Dict[str, List[int]] = {}
    with jsonl_path.open("r", encoding="utf-8") as handle:
        # readline() is used instead of iterating the handle because tell()
        # is disabled on a text file while it is being iterated.
        while True:
            offset = handle.tell()
            line = handle.readline()
            if not line:
                break
            raw = line.rstrip("\n")
            if not raw:
                continue
            obj = json.loads(raw)
            # Rows that are valid JSON but not objects (lists, strings, ...)
            # have no "lang_code" and are skipped instead of crashing on .get.
            if not isinstance(obj, dict) or obj.get("lang_code") != "it":
                continue
            word = str(obj.get("word", "")).strip().lower()
            if word:
                index.setdefault(word, []).append(offset)

    write_json(index_cache_path, {"meta": expected_meta, "index": index})
    return index
|
||||
|
||||
|
||||
def read_jsonl_objects_at_offsets(jsonl_path: Path, offsets: Sequence[int]) -> List[Dict[str, object]]:
    """Parse the JSON lines that start at the given positions of ``jsonl_path``.

    The file is opened once; for each offset the handle seeks there and one
    line is read. Offsets yielding an empty read are skipped. Results keep
    the order of ``offsets``.
    """
    parsed: List[Dict[str, object]] = []
    with jsonl_path.open("r", encoding="utf-8") as stream:
        for position in offsets:
            stream.seek(position)
            row = stream.readline()
            if row:
                parsed.append(json.loads(row))
    return parsed
|
||||
|
||||
|
||||
def map_pos(value: str) -> str:
    """Translate a part-of-speech label through ``POS_MAP``.

    Labels missing from the map are returned uppercased; empty or falsy
    input yields an empty string.
    """
    label = str(value or "").strip().lower()
    if label in POS_MAP:
        return POS_MAP[label]
    return label.upper() if label else ""
|
||||
|
||||
|
||||
def normalize_text(text: str) -> str:
    """Strip ``text`` and collapse every run of whitespace into one space (falsy input -> ``""``)."""
    return re.sub(r"\s+", " ", str(text or "").strip())
|
||||
|
||||
|
||||
def sense_topics(sense: Dict[str, object], categories: Sequence[str]) -> List[str]:
    """Return the sorted internal topics of one sense.

    Topics come from the sense's own ``topics`` (via ``TOPIC_MAP``) and from
    the given category labels (via ``CATEGORY_TOPIC_HINTS``); labels are
    stripped and lowercased before lookup, unmapped ones are ignored.
    """
    topic_labels = (str(label).strip().lower() for label in sense.get("topics", []) or [])
    category_labels = (str(label).strip().lower() for label in categories)

    resolved = {TOPIC_MAP.get(label) for label in topic_labels}
    resolved.update(CATEGORY_TOPIC_HINTS.get(label) for label in category_labels)
    # Failed lookups leave None in the set; drop it before sorting.
    return sorted(topic for topic in resolved if topic)
|
||||
|
||||
|
||||
def word_level_topics(entries: Sequence[Dict[str, object]], categories: Sequence[str]) -> List[str]:
    """Return the sorted union of ``sense_topics`` over every dict sense of every entry."""
    collected = set()
    for item in entries:
        collected.update(
            topic
            for sense in item.get("senses", []) or []
            if isinstance(sense, dict)
            for topic in sense_topics(sense, categories)
        )
    return sorted(collected)
|
||||
|
||||
|
||||
def grammar_hints(entries: Sequence[Dict[str, object]]) -> List[str]:
    """Derive sorted grammar hint labels from entry tags and gloss wording.

    ``voce_verbale`` is added for verb entries tagged ``form-of``. Gloss
    texts are scanned case-insensitively: ``congiuntivo``, ``imperativo``
    and ``plurale`` for every entry, plus ``diminutivo``, ``accrescitivo``
    and ``peggiorativo`` for noun entries only.
    """
    generic_markers = ("congiuntivo", "imperativo", "plurale")
    noun_markers = ("diminutivo", "accrescitivo", "peggiorativo")

    found = set()
    for item in entries:
        part_of_speech = str(item.get("pos", "")).lower()
        labels = [str(tag).lower() for tag in item.get("tags", []) or []]
        if part_of_speech == "verb" and "form-of" in labels:
            found.add("voce_verbale")

        markers = generic_markers + noun_markers if part_of_speech == "noun" else generic_markers
        for sense in item.get("senses", []) or []:
            if not isinstance(sense, dict):
                continue
            for gloss in sense.get("glosses", []) or []:
                lowered = str(gloss).lower()
                found.update(marker for marker in markers if marker in lowered)
    return sorted(found)
|
||||
|
||||
|
||||
def simplify_entry(obj: Dict[str, object]) -> Dict[str, object]:
    """Reduce one raw JSONL object to the subset of fields kept in the lexicon.

    Senses without any non-empty gloss are dropped; glosses, examples and
    categories are whitespace-normalized; the part of speech goes through
    ``map_pos``.
    """
    categories = [normalize_text(label) for label in obj.get("categories", []) or [] if label]

    senses = []
    for raw_sense in obj.get("senses", []) or []:
        if not isinstance(raw_sense, dict):
            continue
        glosses = [clean for item in raw_sense.get("glosses", []) or [] if (clean := normalize_text(item))]
        if not glosses:
            continue
        examples = [
            clean
            for example in raw_sense.get("examples", []) or []
            if isinstance(example, dict) and (clean := normalize_text(example.get("text", "")))
        ]
        senses.append(
            {
                "glosses": glosses,
                "examples": examples,
                "topics": sense_topics(raw_sense, categories),
                "tags": [str(tag) for tag in raw_sense.get("tags", []) or [] if tag],
                "categories": [normalize_text(label) for label in raw_sense.get("categories", []) or [] if label],
            }
        )

    def linked_words(field: str) -> List[Dict[str, object]]:
        # Keep only dict items that actually carry a "word".
        return [item for item in obj.get(field, []) or [] if isinstance(item, dict) and item.get("word")]

    return {
        "word": obj.get("word"),
        "lang": obj.get("lang"),
        "lang_code": obj.get("lang_code"),
        "pos": map_pos(str(obj.get("pos", ""))),
        "pos_title": obj.get("pos_title"),
        "tags": [str(tag) for tag in obj.get("tags", []) or [] if tag],
        "categories": categories,
        "senses": senses,
        "synonyms": linked_words("synonyms"),
        "related": linked_words("related"),
    }
|
||||
|
||||
|
||||
def choose_best_entries(refined_entry: Dict[str, object], candidates: Sequence[Dict[str, object]]) -> List[Dict[str, object]]:
    """Keep the candidates whose part of speech matches the refined entry.

    When none matches (case-insensitively), all candidates are returned.
    """
    wanted = str(refined_entry.get("pos", "")).upper()
    same_pos = [option for option in candidates if str(option.get("pos", "")).upper() == wanted]
    return same_pos or list(candidates)
|
||||
|
||||
|
||||
def wiktextract_already_useful(entry: Dict[str, object]) -> bool:
    """Tell whether the entry's ``wiktextract`` block makes a new lookup pointless.

    True when a previous lookup ended as ``missing``/``no_match``, or when it
    is ``enriched`` and holds definitions or entries.
    """
    block = entry.get("wiktextract", {})
    if not isinstance(block, dict):
        return False
    state = str(block.get("status", "")).lower()
    if state in {"missing", "no_match"}:
        return True
    return state == "enriched" and bool(block.get("definitions") or block.get("entries"))
|
||||
|
||||
|
||||
def select_targets(
    refined_payload: Dict[str, object],
    review_payload: Dict[str, object],
    review_reasons: set[str],
    explicit_words: set[str],
    word_limit: int,
    skip_existing: bool,
) -> Tuple[List[Dict[str, object]], int]:
    """Choose which refined entries must be enriched.

    With ``explicit_words`` only those words are considered, in alphabetical
    order. Otherwise the review file is scanned in order and a word is
    selected when one of its reasons is in ``review_reasons`` or its BabelNet
    status is ``no_match``.

    Args:
        refined_payload: Lexicon payload with an ``entries`` list.
        review_payload: Review payload with an ``entries`` list.
        review_reasons: Lowercased review reasons that trigger selection.
        explicit_words: Lowercased forms requested explicitly (may be empty).
        word_limit: Maximum number of selected entries; ``0`` means no limit.
        skip_existing: Skip entries for which ``wiktextract_already_useful``
            is true.

    Returns:
        ``(selected_entries, skipped_existing_count)``.
    """
    refined_entries = [entry for entry in refined_payload.get("entries", []) or [] if isinstance(entry, dict)]
    # Keyed by lowercased form only: when several entries share a form, the
    # last one wins.
    refined_by_word = {str(entry.get("form", "")).lower(): entry for entry in refined_entries if entry.get("form")}

    if explicit_words:
        selected = []
        skipped_existing_count = 0
        # Sets have no stable iteration order; sorting makes the selection
        # (and its truncation by word_limit) reproducible between runs.
        for word in sorted(explicit_words):
            entry = refined_by_word.get(word)
            if entry is None:
                continue
            if skip_existing and wiktextract_already_useful(entry):
                skipped_existing_count += 1
                continue
            selected.append(entry)
        if word_limit > 0:
            selected = selected[:word_limit]
        return selected, skipped_existing_count

    review_entries = [entry for entry in review_payload.get("entries", []) or [] if isinstance(entry, dict)]
    selected_words: List[str] = []
    seen = set()
    skipped_existing_count = 0

    for review_entry in review_entries:
        word = str(review_entry.get("form", "")).strip().lower()
        if not word or word in seen:
            continue
        reasons = {str(item).lower() for item in review_entry.get("review_reasons", []) or []}
        refined = refined_by_word.get(word)
        if refined is None:
            continue
        if skip_existing and wiktextract_already_useful(refined):
            skipped_existing_count += 1
            continue
        babelnet_status = str((refined.get("babelnet") or {}).get("status", "")).lower()
        if reasons.intersection(review_reasons) or babelnet_status == "no_match":
            selected_words.append(word)
            seen.add(word)
        if word_limit > 0 and len(selected_words) >= word_limit:
            break

    return [refined_by_word[word] for word in selected_words if word in refined_by_word], skipped_existing_count
|
||||
|
||||
|
||||
def wiktextract_payload_for_entry(refined_entry: Dict[str, object], matches: Sequence[Dict[str, object]]) -> Dict[str, object]:
    """Build the ``wiktextract`` block stored on a refined entry.

    Args:
        refined_entry: Lexicon entry being enriched; its ``pos`` drives the
            choice among ``matches``.
        matches: Simplified objects found for the entry's word.

    Returns:
        A payload with ``status``, ``matched``, ``definitions``, ``entries``,
        ``topic_hints``, ``grammar_hints`` and ``categories``. The status is
        ``missing`` without matches, ``enriched`` when at least one gloss was
        found, ``entries_without_definitions`` otherwise.
    """
    if not matches:
        return {
            "status": "missing",
            "matched": False,
            "definitions": [],
            "entries": [],
            "topic_hints": [],
            "grammar_hints": [],
            # Present (empty) so both branches expose the same keys.
            "categories": [],
        }

    selected_entries = choose_best_entries(refined_entry, matches)

    definitions = []
    all_categories = []
    for item in selected_entries:
        all_categories.extend(item.get("categories", []) or [])
        for sense in item.get("senses", []) or []:
            if isinstance(sense, dict):
                definitions.extend(sense.get("glosses", []) or [])
    definitions = [normalize_text(item) for item in definitions if normalize_text(item)]

    return {
        "status": "enriched" if definitions else "entries_without_definitions",
        "matched": bool(definitions),
        "definitions": definitions,
        "entries": selected_entries,
        "topic_hints": word_level_topics(selected_entries, all_categories),
        "grammar_hints": grammar_hints(selected_entries),
        "categories": sorted(set(normalize_text(item) for item in all_categories if normalize_text(item))),
    }
|
||||
|
||||
|
||||
def enrich_from_wiktextract(args: argparse.Namespace) -> Dict[str, object]:
    """Run the offline enrichment: select targets, look them up, write the merged lexicon.

    Args:
        args: Parsed CLI namespace (``input``, ``review``, ``wiktextract``,
            ``output``, ``index_cache``, ``word_limit``, ``words``,
            ``review_reasons``, ``skip_existing``).

    Returns:
        Run summary with the target, skipped, matched and missing counts and
        the output path as a string.

    Raises:
        ValueError: If the input lexicon is not a mapping with ``entries``.
    """
    refined_payload = load_json(args.input, {"entries": []})
    if not (isinstance(refined_payload, dict) and "entries" in refined_payload):
        raise ValueError(f"Lessico refined non valido: {args.input}")

    # An unusable review file is tolerated and treated as empty.
    review_payload = load_json(args.review, {"entries": []})
    if not isinstance(review_payload, dict):
        review_payload = {"entries": []}

    targets, skipped_existing_count = select_targets(
        refined_payload,
        review_payload,
        parse_csv_set(args.review_reasons),
        parse_csv_set(args.words),
        args.word_limit,
        args.skip_existing,
    )
    total = len(targets)

    banner = f"Target selezionati: {total}"
    if args.skip_existing:
        banner += f" | già saltati per wiktextract esistente: {skipped_existing_count}"
    print(banner)

    index = load_or_build_index(args.wiktextract, args.index_cache)

    # Working copy of the whole lexicon keyed by (form, pos); enriched
    # targets overwrite their slot below.
    refined_index = {}
    for source_entry in refined_payload.get("entries", []) or []:
        if isinstance(source_entry, dict):
            refined_index[entry_key(source_entry)] = deepcopy(source_entry)

    matched_count = 0
    missing_count = 0
    for position, target in enumerate(targets, start=1):
        enriched = deepcopy(target)
        word = str(target.get("form", "")).strip().lower()
        raw_objects = read_jsonl_objects_at_offsets(args.wiktextract, index.get(word, []))
        payload = wiktextract_payload_for_entry(enriched, [simplify_entry(obj) for obj in raw_objects])
        enriched["wiktextract"] = payload
        enriched["wiktextract_generated_at"] = datetime.now().astimezone().isoformat(timespec="seconds")
        refined_index[entry_key(enriched)] = enriched

        if payload.get("matched"):
            matched_count += 1
        else:
            missing_count += 1

        progress = [
            f"[{position}/{total}] {word}:",
            f"status={payload.get('status')}",
            f"def={len(payload.get('definitions', []))}",
            f"topics={len(payload.get('topic_hints', []))}",
            f"entries={len(payload.get('entries', []))}",
        ]
        print(" ".join(progress))

    merged_entries = sorted(
        refined_index.values(),
        key=lambda item: (str(item.get("normalized_form", "")), str(item.get("pos", ""))),
    )

    # Existing metadata is preserved and extended with the run statistics.
    base_meta = refined_payload.get("meta")
    if not isinstance(base_meta, dict):
        base_meta = {}
    merged_payload = {
        "meta": {
            **base_meta,
            "wiktextract_source": str(args.wiktextract),
            "wiktextract_generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "wiktextract_target_count": total,
            "wiktextract_skipped_existing_count": skipped_existing_count,
            "wiktextract_matched_count": matched_count,
            "wiktextract_missing_count": missing_count,
        },
        "entries": merged_entries,
    }
    write_json(args.output, merged_payload)

    return {
        "target_count": total,
        "skipped_existing_count": skipped_existing_count,
        "matched_count": matched_count,
        "missing_count": missing_count,
        "output": str(args.output),
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: run the enrichment and print its summary."""
    summary = enrich_from_wiktextract(parse_args())
    report = (
        f"Lessico con Wiktextract generato: {summary['output']}",
        f"Voci trattate: {summary['target_count']}",
        f"Voci già saltate: {summary['skipped_existing_count']}",
        f"Match Wiktextract: {summary['matched_count']}",
        f"Senza match Wiktextract: {summary['missing_count']}",
    )
    for line in report:
        print(line)


if __name__ == "__main__":
    main()
|
||||
678
enrich_review_from_wiktionary.py
Normal file
678
enrich_review_from_wiktionary.py
Normal file
@@ -0,0 +1,678 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default file locations (resolved next to this script) and the API endpoint.
REVIEW_INPUT_PATH = Path(__file__).with_name("to_be_review.json")
WIKTIONARY_CACHE_PATH = Path(__file__).with_name(".wiktionary_cache.json")
WIKTIONARY_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_refined_plus_wiktionary.json")
WIKTIONARY_API_URL = "https://it.wiktionary.org/w/api.php"

# Set of review reason labels defined as this module's default.
DEFAULT_REVIEW_REASONS = {"no_viable_definition", "only_general_topics", "babelnet_ambiguous"}

# Part-of-speech names and abbreviations -> uppercase tag.
POS_ALIASES = {
    # nouns
    "sostantivo": "NOUN",
    "nome": "NOUN",
    "sost": "NOUN",
    # adjectives
    "aggettivo": "ADJ",
    "agg": "ADJ",
    # verbs
    "verbo": "VERB",
    "verb": "VERB",
    "verb form": "VERB_FORM",
    # adverbs
    "avverbio": "ADV",
    "avv": "ADV",
    # prepositions
    "preposizione": "PREP",
    "prep": "PREP",
    # conjunctions
    "congiunzione": "CONJ",
    "cong": "CONJ",
    # pronouns
    "pronome": "PRON",
    "pron": "PRON",
    # articles
    "articolo": "ART",
    # interjections
    "interiezione": "INTJ",
    "inter": "INTJ",
    # phrases
    "locuzione": "PHRASE",
    "loc": "PHRASE",
}

# Internal topic name -> keywords (some are word stems, e.g. "cattolic").
TOPIC_KEYWORDS = {
    "religion": ("religione", "cattolic", "sacro", "sacra", "devozion", "scapolare", "abbazia", "monastero"),
    "clothing": ("abito", "vestito", "vestit", "abbigliamento", "indumento", "stoffa"),
    "grammar": ("diminutivo", "voce verbale", "congiuntivo", "plurale", "singolare", "grammatica", "verbo"),
    "geography": ("comune", "paese", "regione", "provincia", "citta", "localita", "frazione"),
    "transport": ("veicolo", "motore", "treno", "aereo", "trasporto", "nave", "imbarcazione"),
    "health": ("medicina", "ospedale", "malattia", "cura", "feriti", "ammalati", "sanitario"),
}

# Grammar-related keywords.
GRAMMAR_KEYWORDS = (
    "diminutivo",
    "accrescitivo",
    "peggiorativo",
    "alterato",
    "voce verbale",
    "congiuntivo",
    "participio",
    "plurale",
    "singolare",
    "maschile",
    "femminile",
)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Define the command-line options of the Wiktionary enrichment script and parse them.

    Returns:
        The namespace produced by ``argparse`` for the current process arguments.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Arricchisce le voci problematiche del lessico refined con definizioni e metadati "
            "estratti da it.wiktionary.org."
        )
    )

    # (flag, keyword arguments) pairs, registered in this order so the help
    # output lists the options in the same sequence as before.
    option_specs = (
        ("--input", {
            "type": Path,
            "default": REFINED_LEXICON_OUTPUT_PATH,
            "help": "Lessico refined di partenza.",
        }),
        ("--review", {
            "type": Path,
            "default": REVIEW_INPUT_PATH,
            "help": "File to_be_review.json da usare per selezionare le voci prioritarie.",
        }),
        ("--output", {
            "type": Path,
            "default": WIKTIONARY_OUTPUT_PATH,
            "help": "Nuovo lessico con blocco wiktionary aggiunto.",
        }),
        ("--cache", {
            "type": Path,
            "default": WIKTIONARY_CACHE_PATH,
            "help": "Cache locale delle risposte Wiktionary.",
        }),
        ("--word-limit", {
            "type": int,
            "default": 0,
            "help": "Limite massimo di parole da elaborare. 0 = tutte le candidate.",
        }),
        ("--sleep", {
            "type": float,
            "default": 1.0,
            "help": "Pausa tra le richieste HTTP a Wiktionary.",
        }),
        ("--save-every", {
            "type": int,
            "default": 25,
            "help": "Salva cache e output ogni N parole elaborate per non perdere progresso.",
        }),
        ("--retry-429", {
            "type": int,
            "default": 3,
            "help": "Numero massimo di tentativi aggiuntivi se Wiktionary risponde HTTP 429.",
        }),
        ("--backoff-429", {
            "type": float,
            "default": 30.0,
            "help": "Secondi di attesa iniziali dopo un HTTP 429; raddoppiano a ogni nuovo tentativo.",
        }),
        ("--stop-on-429", {
            "action": "store_true",
            "help": "Se attivo, al primo HTTP 429 salva lo stato e interrompe il batch senza altri tentativi.",
        }),
        ("--words", {
            "default": "",
            "help": "Lista separata da virgole di lemmi specifici da arricchire.",
        }),
        ("--review-reasons", {
            "default": ",".join(sorted(DEFAULT_REVIEW_REASONS)),
            "help": "Motivi del file review da trattare con priorita, separati da virgole.",
        }),
        ("--api-url", {
            "default": WIKTIONARY_API_URL,
            "help": "Endpoint MediaWiki Action API di Wiktionary.",
        }),
        ("--skip-existing", {
            "action": "store_true",
            "help": "Salta le voci che nel lessico di input hanno già un blocco wiktionary con stato utile.",
        }),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)

    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Return the JSON document stored at *path*, or *default* when the file does not exist.

    The file is read as UTF-8; malformed JSON raises ``json.JSONDecodeError``.
    """
    return json.loads(path.read_text(encoding="utf-8")) if path.exists() else default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Serialize *payload* as pretty-printed UTF-8 JSON and store it at *path*.

    The document is written to a sibling ``<name>.tmp`` file first and then
    renamed over *path*, so an interruption in the middle of the write cannot
    leave a truncated cache or lexicon behind.

    Args:
        path: Destination file; replaced if it already exists.
        payload: Any JSON-serializable object.
    """
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    temp_path = path.with_name(f"{path.name}.tmp")
    temp_path.write_text(serialized, encoding="utf-8")
    # Path.replace overwrites the destination in a single rename step.
    temp_path.replace(path)
|
||||
|
||||
|
||||
def parse_csv_set(value: str) -> set[str]:
    """Split a comma-separated string into a set of trimmed, lowercased, non-empty items.

    Falsy input (``None``, ``""``) yields an empty set.
    """
    pieces = (chunk.strip() for chunk in str(value or "").split(","))
    return {piece.lower() for piece in pieces if piece}
|
||||
|
||||
|
||||
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Build the ``(form, pos)`` identity of a lexicon entry.

    The form comes from ``normalized_form`` (falling back to ``form``) and is
    trimmed and lowercased; the part of speech is trimmed and uppercased.
    Missing values become empty strings.
    """
    raw_form = entry.get("normalized_form") or entry.get("form") or ""
    raw_pos = entry.get("pos") or ""
    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
|
||||
|
||||
|
||||
def fetch_wikitext(title: str, api_url: str) -> Dict[str, object]:
    """Download the current wikitext of *title* through the MediaWiki Action API.

    Args:
        title: Page title to look up.
        api_url: Base URL of the ``api.php`` endpoint.

    Returns:
        ``{"status": "missing", ...}`` when the API returns no page or flags it
        as missing; otherwise a dict with ``status`` (``"ok"`` or ``"empty"``),
        ``title``, ``pageid`` and ``wikitext``.

    Raises:
        urllib.error.HTTPError: propagated from ``urlopen`` (callers handle 429).
    """
    query_string = urllib.parse.urlencode(
        {
            "action": "query",
            "prop": "revisions",
            "titles": title,
            "rvprop": "content",
            "rvslots": "main",
            "formatversion": "2",
            "format": "json",
        }
    )
    request = urllib.request.Request(
        f"{api_url}?{query_string}",
        headers={
            "User-Agent": "cruciverba-alpha/0.1 (local lexical enrichment)",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = json.loads(response.read().decode("utf-8"))

    pages = (payload.get("query") or {}).get("pages") or []
    if not pages:
        return {"status": "missing"}

    # Only the first returned page is considered.
    page = pages[0]
    if page.get("missing"):
        return {"status": "missing", "title": page.get("title", title)}

    content = ""
    revisions = page.get("revisions") or []
    if revisions:
        main_slot = (revisions[0].get("slots") or {}).get("main") or {}
        content = str(main_slot.get("content") or "")

    return {
        "status": "ok" if content else "empty",
        "title": page.get("title", title),
        "pageid": page.get("pageid"),
        "wikitext": content,
    }
|
||||
|
||||
|
||||
def fetch_wikitext_with_retry(title: str, args: argparse.Namespace) -> Dict[str, object]:
    """Call ``fetch_wikitext`` and retry with exponential backoff on HTTP 429.

    Reads ``api_url``, ``backoff_429`` (initial wait, floored at one second and
    doubled after every retry), ``retry_429`` (maximum number of extra
    attempts) and ``stop_on_429`` (disable retries) from *args*.

    Raises:
        urllib.error.HTTPError: for any non-429 error, or for a 429 once
            retries are disabled or exhausted.
    """
    retries_done = 0
    wait_seconds = max(1.0, float(args.backoff_429))
    while True:
        try:
            return fetch_wikitext(title, args.api_url)
        except urllib.error.HTTPError as exc:
            may_retry = (
                exc.code == 429
                and not args.stop_on_429
                and retries_done < max(0, int(args.retry_429))
            )
            if not may_retry:
                raise
        retries_done += 1
        print(f"[429] {title}: attendo {wait_seconds:.1f}s prima del tentativo {retries_done}/{args.retry_429}")
        time.sleep(wait_seconds)
        wait_seconds *= 2
|
||||
|
||||
|
||||
def normalize_heading(text: str) -> str:
    """Return a canonical lowercase form of a section heading.

    The Italian language template is returned verbatim as ``"{{-it-}}"`` (it
    would otherwise be removed by the wikicode stripper); every other heading
    is stripped of wiki markup, trimmed and lowercased.
    """
    compact = str(text or "").strip().lower().replace(" ", "")
    if compact == "{{-it-}}":
        return "{{-it-}}"
    return strip_wikicode(text).strip().lower()
|
||||
|
||||
|
||||
def extract_italian_section(wikitext: str) -> str:
    """Return the body of the Italian language section of a Wiktionary page.

    The section starts after a level-2 heading that is either the
    ``{{-it-}}`` template or the text ``italiano`` / ``it``, and ends at the
    next level-2 heading (or at the end of the page).

    Args:
        wikitext: Full wikitext of the page.

    Returns:
        The section text without its heading, or ``""`` when no Italian
        section is found.
    """
    # The (?!=) lookahead restricts the match to level-2 headings. Without it
    # "=== ... ===" lines also matched, so the section was cut at its first
    # sub-heading and the part-of-speech sub-sections were lost.
    section_pattern = re.compile(r"^==(?!=)\s*(.*?)\s*==\s*$", re.MULTILINE)
    matches = list(section_pattern.finditer(wikitext))
    for index, match in enumerate(matches):
        raw_heading = str(match.group(1) or "").strip().lower().replace(" ", "")
        heading = normalize_heading(match.group(1))
        if raw_heading == "{{-it-}}" or heading in {"italiano", "it"}:
            start = match.end()
            end = matches[index + 1].start() if index + 1 < len(matches) else len(wikitext)
            return wikitext[start:end]
    return ""
|
||||
|
||||
|
||||
def strip_templates(text: str) -> str:
    """Remove ``{{...}}`` templates from *text*, keeping the arguments of piped ones.

    On each pass a brace-free template ``{{name|args}}`` is replaced by
    ``args`` (everything after the first pipe) and any remaining brace-free
    template is deleted. Passes repeat until the text stops changing, which
    unwinds nested templates from the inside out.
    """
    result = text
    while True:
        reduced = re.sub(r"\{\{([^{}|]+)\|([^{}]+?)\}\}", r"\2", result)
        reduced = re.sub(r"\{\{[^{}]+\}\}", "", reduced)
        if reduced == result:
            return reduced
        result = reduced
|
||||
|
||||
|
||||
def strip_wikicode(text: str) -> str:
    """Reduce a wikitext fragment to plain text.

    Steps, in order: drop HTML comments, ``<ref>...</ref>`` blocks and other
    tags; strip templates; unwrap ``[[target|label]]`` and ``[[target]]``
    links; remove bold/italic quote markers; turn non-breaking spaces into
    regular ones; collapse whitespace; trim surrounding spaces and
    ``. ; : -`` characters.

    Args:
        text: Raw wikitext; falsy values are treated as an empty string.

    Returns:
        The cleaned single-line text.
    """
    value = str(text or "")
    value = re.sub(r"<!--.*?-->", " ", value, flags=re.DOTALL)
    value = re.sub(r"<ref[^>]*>.*?</ref>", " ", value, flags=re.DOTALL)
    value = re.sub(r"<[^>]+>", " ", value)
    value = strip_templates(value)
    value = re.sub(r"\[\[([^|\]]+)\|([^\]]+)\]\]", r"\2", value)
    value = re.sub(r"\[\[([^\]]+)\]\]", r"\1", value)
    value = value.replace("'''", "").replace("''", "")
    # The previous replace(" ", " ") swapped a space for a space and did
    # nothing; handle both spellings of a non-breaking space explicitly so the
    # "&nbsp;" entity does not leak into definitions.
    value = value.replace("&nbsp;", " ").replace("\u00a0", " ")
    value = re.sub(r"\s+", " ", value)
    return value.strip(" .;:-")
|
||||
|
||||
|
||||
def infer_topics(definitions: Sequence[str], categories: Sequence[str]) -> List[str]:
    """Guess topic labels from definitions and category names.

    A topic from ``TOPIC_KEYWORDS`` is reported when at least one of its
    keywords occurs as a substring of the lowercased, space-joined text.

    Args:
        definitions: Definition strings of the page.
        categories: Category names of the page.

    Returns:
        The matching topic names, sorted alphabetically.
    """
    # Unpack instead of "definitions + list(categories)": the old form raised
    # TypeError for any non-list sequence (for example a tuple).
    haystack = " ".join([*definitions, *categories]).lower()
    return sorted(
        topic
        for topic, keywords in TOPIC_KEYWORDS.items()
        if any(keyword in haystack for keyword in keywords)
    )
|
||||
|
||||
|
||||
def infer_grammar_hints(definitions: Sequence[str], raw_section: str) -> List[str]:
    """List the ``GRAMMAR_KEYWORDS`` that occur in the definitions or in the raw section.

    Matching is a case-insensitive substring search; the result is sorted and
    free of duplicates.
    """
    haystack = f"{' '.join(definitions)} {raw_section}".lower()
    return sorted({keyword for keyword in GRAMMAR_KEYWORDS if keyword in haystack})
|
||||
|
||||
|
||||
def detect_pos_from_heading(heading: str) -> Optional[str]:
    """Map a textual section heading to a part-of-speech tag.

    The heading is normalized and the ``POS_ALIASES`` labels are tried
    longest-first, so a longer label wins over a shorter one it contains.

    Returns:
        The tag of the first label contained in the heading, or ``None`` when
        the heading is empty or matches no label.
    """
    normalized = normalize_heading(heading)
    if not normalized:
        return None
    longest_first = sorted(POS_ALIASES.items(), key=lambda item: len(item[0]), reverse=True)
    return next((pos for label, pos in longest_first if label in normalized), None)
|
||||
|
||||
|
||||
def parse_template_marker(line: str) -> Tuple[Optional[str], Optional[str]]:
    """Classify a line that consists of a single ``{{-name-}}`` section template.

    Args:
        line: One line of wikitext.

    Returns:
        ``(None, None)`` when the line is not such a template; otherwise one of
        ``("language", "it")``, ``("pos", <tag>)`` or ``("subsection", <name>)``.
        Synonym templates are reported as ``("subsection", "sinonimi")``.
    """
    stripped = line.strip()
    match = re.match(r"^\{\{-([^{}|]+?)-?(?:\|.*)?\}\}$", stripped, flags=re.IGNORECASE)
    if not match:
        return None, None

    marker = match.group(1).strip().lower()
    if marker == "it":
        return "language", "it"

    # A bare "pron" marker is a subsection. It must be tested before the alias
    # loop, where the "pron" alias would turn it into an empty PRON entry.
    # NOTE(review): assumes {{-pron-}} is it.wiktionary's pronunciation
    # section and pronouns use longer markers (e.g. "pronome") - confirm.
    if marker == "pron":
        return "subsection", marker

    # Accept the short synonyms marker too, so the parser's "sinonim" check
    # is reached. NOTE(review): assumes {{-sin-}} marks synonyms - confirm.
    if marker == "sin" or marker.startswith("sinon"):
        return "subsection", "sinonimi"

    for label, pos in sorted(POS_ALIASES.items(), key=lambda item: len(item[0]), reverse=True):
        if marker.startswith(label):
            return "pos", pos

    # Any other template (etymology, translations, notes, ...) is a subsection.
    return "subsection", marker
|
||||
|
||||
|
||||
def parse_wiktionary_section(section_text: str) -> Dict[str, object]:
    """Parse the Italian section of a page into part-of-speech entries.

    The text is scanned line by line. A part-of-speech template or a level-3
    heading naming a part of speech opens a new entry. Other templates and
    headings only update the current subsection. Inside an entry, ``#`` lines
    become definitions, ``#:`` / ``#*`` lines become examples, and ``*`` lines
    under a subsection starting with "sinonim" become synonyms.
    ``[[Categoria:...]]`` links are collected from every non-empty line.

    Returns:
        A dict with ``entries``, sorted unique ``categories``, the flattened
        ``definitions``, ``topic_hints`` and ``grammar_hints``.
    """
    heading_pattern = re.compile(r"^(={3,4})\s*(.*?)\s*\1\s*$")
    entries: List[Dict[str, object]] = []
    categories: List[str] = []
    current: Optional[Dict[str, object]] = None
    current_subsection = ""

    def open_entry(pos: object, heading: object) -> Dict[str, object]:
        # Create an empty entry for a part of speech and register it.
        created = {
            "pos": pos,
            "heading": heading,
            "definitions": [],
            "examples": [],
            "synonyms": [],
        }
        entries.append(created)
        return created

    for raw_line in section_text.splitlines():
        line = raw_line.rstrip()
        if not line:
            continue

        categories.extend(
            strip_wikicode(name) for name in re.findall(r"\[\[Categoria:([^\]]+)\]\]", line)
        )

        marker_kind, marker_value = parse_template_marker(line)
        if marker_kind == "pos":
            current = open_entry(marker_value, marker_value)
            current_subsection = ""
            continue
        if marker_kind == "subsection":
            current_subsection = str(marker_value or "")
            continue

        heading_match = heading_pattern.match(line)
        if heading_match:
            heading = heading_match.group(2)
            # Only level-3 headings can open an entry; deeper ones are subsections.
            pos = detect_pos_from_heading(heading) if len(heading_match.group(1)) == 3 else None
            if pos:
                current = open_entry(pos, strip_wikicode(heading))
                current_subsection = ""
            else:
                current_subsection = normalize_heading(heading)
            continue

        if current is None:
            # Content before the first part-of-speech marker is ignored.
            continue

        stripped = line.lstrip()
        is_example = stripped.startswith("#:") or stripped.startswith("#*")
        if stripped.startswith("#") and not is_example and not stripped.startswith("#;"):
            definition = strip_wikicode(stripped.lstrip("#").strip())
            if definition:
                current["definitions"].append(definition)
        elif is_example:
            example = strip_wikicode(stripped[2:].strip())
            if example:
                current["examples"].append(example)
        elif current_subsection.startswith("sinonim") and stripped.startswith("*"):
            synonym = strip_wikicode(stripped.lstrip("*").strip())
            if synonym:
                current["synonyms"].append(synonym)

    flat_definitions = [text for item in entries for text in item["definitions"]]
    return {
        "entries": entries,
        "categories": sorted(set(filter(None, categories))),
        "definitions": flat_definitions,
        "topic_hints": infer_topics(flat_definitions, categories),
        "grammar_hints": infer_grammar_hints(flat_definitions, section_text),
    }
|
||||
|
||||
|
||||
def wiktionary_payload_for_entry(entry: Dict[str, object], api_response: Dict[str, object]) -> Dict[str, object]:
    """Build the ``wiktionary`` block stored on a lexicon entry.

    Args:
        entry: Lexicon entry; its ``form`` is the fallback title.
        api_response: Result of the wikitext fetch (``status``, ``title``,
            ``pageid``, ``wikitext``).

    Returns:
        A dict whose ``status`` is the fetch status when it is not ``"ok"``,
        ``"no_italian_section"``, ``"section_without_definitions"`` or
        ``"enriched"``. Only the last two carry ``pageid`` and ``raw_excerpt``
        (the first 4000 characters of the Italian section).
    """

    def unmatched(status: str, url_title: object) -> Dict[str, object]:
        # Shared shape of every outcome that produced no parsed content.
        return {
            "status": status,
            "matched": False,
            "page_title": api_response.get("title") or entry.get("form"),
            "source_url": f"https://it.wiktionary.org/wiki/{urllib.parse.quote(str(url_title))}",
            "definitions": [],
            "entries": [],
            "topic_hints": [],
            "grammar_hints": [],
            "categories": [],
        }

    status = str(api_response.get("status", "missing"))
    if status != "ok":
        # A failed fetch links to the entry form, not to the API title.
        return unmatched(status, entry.get("form", ""))

    italian_section = extract_italian_section(str(api_response.get("wikitext") or ""))
    if not italian_section:
        return unmatched("no_italian_section", api_response.get("title") or entry.get("form", ""))

    parsed = parse_wiktionary_section(italian_section)
    matched = bool(parsed["definitions"])
    url_title = str(api_response.get("title") or entry.get("form", ""))

    return {
        "status": "enriched" if matched else "section_without_definitions",
        "matched": matched,
        "page_title": api_response.get("title") or entry.get("form"),
        "pageid": api_response.get("pageid"),
        "source_url": f"https://it.wiktionary.org/wiki/{urllib.parse.quote(url_title)}",
        "definitions": parsed["definitions"],
        "entries": parsed["entries"],
        "topic_hints": parsed["topic_hints"],
        "grammar_hints": parsed["grammar_hints"],
        "categories": parsed["categories"],
        "raw_excerpt": italian_section[:4000],
    }
|
||||
|
||||
|
||||
def select_targets(
    refined_payload: Dict[str, object],
    review_payload: Dict[str, object],
    review_reasons: set[str],
    explicit_words: set[str],
    word_limit: int,
    skip_existing: bool,
) -> Tuple[List[Dict[str, object]], int]:
    """Choose which refined-lexicon entries should be enriched.

    When *explicit_words* is non-empty only those words are considered.
    Otherwise the review file drives the selection: an entry is picked when
    one of its review reasons is in *review_reasons* or when its BabelNet
    status is ``no_match``. Words absent from the refined lexicon are ignored.

    Args:
        refined_payload: Lexicon document with an ``entries`` list.
        review_payload: Review document with an ``entries`` list.
        review_reasons: Lowercased reasons that qualify an entry.
        explicit_words: Lowercased words requested on the command line.
        word_limit: Maximum number of entries to return; ``0`` means no limit.
        skip_existing: Skip entries for which ``wiktionary_already_useful``
            is true.

    Returns:
        ``(selected_entries, skipped_existing_count)``.
    """
    refined_entries = [entry for entry in refined_payload.get("entries", []) or [] if isinstance(entry, dict)]
    refined_by_word = {str(entry.get("form", "")).lower(): entry for entry in refined_entries if entry.get("form")}

    if explicit_words:
        selected = []
        skipped_existing_count = 0
        # Sorted iteration: set order varies between runs, so combined with
        # word_limit the old code selected an arbitrary subset of the words.
        for word in sorted(explicit_words):
            entry = refined_by_word.get(word)
            if entry is None:
                continue
            if skip_existing and wiktionary_already_useful(entry):
                skipped_existing_count += 1
                continue
            selected.append(entry)
        selected = selected[:word_limit] if word_limit > 0 else selected
        return selected, skipped_existing_count

    review_entries = [entry for entry in review_payload.get("entries", []) or [] if isinstance(entry, dict)]
    selected_words: List[str] = []
    seen = set()
    skipped_existing_count = 0

    for review_entry in review_entries:
        word = str(review_entry.get("form", "")).strip().lower()
        if not word or word in seen:
            continue
        reasons = {str(item).lower() for item in review_entry.get("review_reasons", []) or []}
        refined = refined_by_word.get(word)
        if refined is None:
            continue
        if skip_existing and wiktionary_already_useful(refined):
            skipped_existing_count += 1
            continue
        babelnet_status = str((refined.get("babelnet") or {}).get("status", "")).lower()
        if reasons.intersection(review_reasons) or babelnet_status == "no_match":
            selected_words.append(word)
            seen.add(word)
        if word_limit > 0 and len(selected_words) >= word_limit:
            break

    return [refined_by_word[word] for word in selected_words if word in refined_by_word], skipped_existing_count
|
||||
|
||||
|
||||
def wiktionary_already_useful(entry: Dict[str, object]) -> bool:
    """Tell whether *entry* already carries a conclusive ``wiktionary`` block.

    A block is conclusive when it is ``enriched`` with definitions or entries,
    or when an earlier lookup ended in a final negative status (``missing``,
    ``no_italian_section``, ``section_without_definitions``, ``empty``).
    """
    block = entry.get("wiktionary", {})
    if not isinstance(block, dict):
        return False

    status = str(block.get("status", "")).lower()
    if status == "enriched":
        return bool(block.get("definitions") or block.get("entries"))
    return status in {"missing", "no_italian_section", "section_without_definitions", "empty"}
|
||||
|
||||
|
||||
def enrich_from_wiktionary(args: argparse.Namespace) -> Dict[str, object]:
    """Run the enrichment batch: select targets, fetch or reuse wikitext, save results.

    For every target the API response comes from the cache when available,
    otherwise from the network (with 429 retries). The entry copy receives a
    ``wiktionary`` block and a timestamp. Cache and merged lexicon are written
    every ``args.save_every`` processed words and once more when the batch
    ends, however it ends.

    Args:
        args: Parsed command-line options (see ``parse_args``).

    Returns:
        A summary dict with the counters, the output path and, when the batch
        was cut short by an HTTP 429, ``stopped_reason`` and ``stop_word``.

    Raises:
        ValueError: when the input lexicon is not a dict with ``entries``.
        urllib.error.HTTPError: for HTTP errors other than 429.
    """
    refined_payload = load_json(args.input, {"entries": []})
    if not isinstance(refined_payload, dict) or "entries" not in refined_payload:
        raise ValueError(f"Lessico refined non valido: {args.input}")

    review_payload = load_json(args.review, {"entries": []})
    if not isinstance(review_payload, dict):
        review_payload = {"entries": []}

    cache = load_json(args.cache, {})
    if not isinstance(cache, dict):
        cache = {}

    targets, skipped_existing_count = select_targets(
        refined_payload,
        review_payload,
        parse_csv_set(args.review_reasons),
        parse_csv_set(args.words),
        args.word_limit,
        args.skip_existing,
    )

    enriched_entries = []
    cache_hits = 0
    network_calls = 0
    network_attempts = 0
    processed_count = 0
    stopped_reason = None
    stop_word = None

    print(
        f"Target selezionati: {len(targets)}"
        + (f" | già saltati per wiktionary esistente: {skipped_existing_count}" if args.skip_existing else "")
    )

    def persist_progress() -> None:
        # Merge the enriched copies over the input entries (keyed by form and
        # part of speech) and write both the cache and the merged lexicon.
        refined_index = {
            entry_key(entry): entry
            for entry in refined_payload.get("entries", []) or []
            if isinstance(entry, dict)
        }
        for item in enriched_entries:
            refined_index[entry_key(item)] = item

        merged_entries = list(refined_index.values())
        merged_entries.sort(key=lambda item: (str(item.get("normalized_form", "")), str(item.get("pos", ""))))

        merged_payload = {
            "meta": {
                **(refined_payload.get("meta", {}) if isinstance(refined_payload.get("meta"), dict) else {}),
                "wiktionary_source": args.api_url,
                "wiktionary_generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
                "wiktionary_target_count": len(targets),
                "wiktionary_processed_count": processed_count,
                "wiktionary_skipped_existing_count": skipped_existing_count,
                "wiktionary_cache_hits": cache_hits,
                "wiktionary_network_calls": network_calls,
                "wiktionary_network_attempts": network_attempts,
                "wiktionary_stopped_reason": stopped_reason,
                "wiktionary_stop_word": stop_word,
            },
            "entries": merged_entries,
        }

        write_json(args.cache, cache)
        write_json(args.output, merged_payload)

    try:
        for index, entry in enumerate(targets, start=1):
            updated = deepcopy(entry)
            word = str(entry.get("form", "")).strip()
            cache_key = word.lower()

            if cache_key in cache:
                api_response = cache[cache_key]
                cache_hits += 1
            else:
                network_attempts += 1
                try:
                    api_response = fetch_wikitext_with_retry(word, args)
                except urllib.error.HTTPError as exc:
                    if exc.code != 429:
                        raise
                    # Rate limited beyond the retry budget: stop the batch;
                    # the finally clause below saves what was collected.
                    stop_word = word
                    stopped_reason = f"http_429_after_{processed_count}_words"
                    print(f"[STOP] Wiktionary ha risposto 429 su '{word}'. Salvo il progresso e interrompo il batch.")
                    break
                cache[cache_key] = api_response
                network_calls += 1
                if args.sleep > 0:
                    time.sleep(args.sleep)

            updated["wiktionary"] = wiktionary_payload_for_entry(updated, api_response)
            updated["wiktionary_generated_at"] = datetime.now().astimezone().isoformat(timespec="seconds")
            enriched_entries.append(updated)
            processed_count += 1
            print(
                f"[{index}/{len(targets)}] {word}: "
                f"status={updated['wiktionary'].get('status')} "
                f"def={len(updated['wiktionary'].get('definitions', []))} "
                f"topics={len(updated['wiktionary'].get('topic_hints', []))}"
            )
            if args.save_every > 0 and processed_count % int(args.save_every) == 0:
                persist_progress()
                print(f"[save] progresso salvato dopo {processed_count} parole")
    finally:
        # Runs on completion, on the 429 stop and on any unexpected error or
        # interruption. Previously those failures lost every response fetched
        # since the last periodic save.
        persist_progress()

    return {
        "target_count": len(targets),
        "processed_count": processed_count,
        "skipped_existing_count": skipped_existing_count,
        "cache_hits": cache_hits,
        "network_calls": network_calls,
        "network_attempts": network_attempts,
        "output": str(args.output),
        "stopped_reason": stopped_reason,
        "stop_word": stop_word,
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: run the enrichment batch and print its summary."""
    result = enrich_from_wiktionary(parse_args())

    print(f"Lessico con Wiktionary generato: {result['output']}")
    print(f"Voci trattate: {result.get('processed_count', result['target_count'])}/{result['target_count']}")
    if "skipped_existing_count" in result:
        print(f"Voci già saltate: {result['skipped_existing_count']}")
    print(f"Cache hit: {result['cache_hits']}")
    print(f"Chiamate rete: {result['network_calls']}")
    if "network_attempts" in result:
        print(f"Tentativi di rete: {result['network_attempts']}")

    # The two lines below appear only when the batch was cut short.
    if result.get("stopped_reason"):
        print(f"Batch interrotto: {result['stopped_reason']}")
    if result.get("stop_word"):
        print(f"Ultima parola bloccante: {result['stop_word']}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
475
main.py
475
main.py
@@ -2,9 +2,14 @@ from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Dict, List
|
||||
|
||||
from build_babelnet_enrichment import BABELNET_ENV_KEY, BABELNET_OUTPUT_PATH, BABELNET_LOCAL_KEY_PATH, load_babelnet_api_key
|
||||
from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH
|
||||
from build_vocabulary import (
|
||||
FILTERED_OUTPUT_PATH,
|
||||
METADATA_OUTPUT_PATH,
|
||||
@@ -13,6 +18,7 @@ from build_vocabulary import (
|
||||
)
|
||||
from build_lexicon import LEXICON_OUTPUT_PATH, build_lexicon
|
||||
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH, build_semantic_lexicon
|
||||
from clue_generator import generate_clues, load_enriched_entries
|
||||
from crossword_filler import CrosswordFiller, load_vocabulary, load_vocabulary_metadata
|
||||
from crossword_generator import CrosswordGenerator, WORDS, render_grid
|
||||
|
||||
@@ -26,6 +32,14 @@ DIFFICULTY_ALIASES: Dict[str, int] = {
|
||||
|
||||
DEFAULT_TOPIC = "general"
|
||||
DEFAULT_INITIAL_WORD_COUNT = len(WORDS)
|
||||
DEFAULT_RUNTIME_LEXICON_CANDIDATES = (
|
||||
"lexicon_it_curated_llm_aggressive.json",
|
||||
"lexicon_it_curated_llm.json",
|
||||
"lexicon_it_curated.json",
|
||||
"lexicon_it_refined_plus_wiktextract.json",
|
||||
ENRICHED_LEXICON_OUTPUT_PATH.name,
|
||||
SEMANTIC_LEXICON_OUTPUT_PATH.name,
|
||||
)
|
||||
ABSTRACTISH_SUFFIXES = ("zione", "zioni", "mento", "menti", "ita", "ezza", "anza", "enza", "ismo")
|
||||
FILL_ALLOWED_POS = {"NOUN", "VERB", "ADJ", "ADV", "PREP", "CONJ"}
|
||||
GENERAL_FILL_MIN_QUALITY = 6
|
||||
@@ -92,6 +106,8 @@ TOPIC_SEED_BLOCKED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
|
||||
),
|
||||
}
|
||||
|
||||
ACTIVE_LEXICON_PATH: Path | None = None
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Generatore e filler di cruciverba.")
|
||||
@@ -115,6 +131,23 @@ def parse_args() -> argparse.Namespace:
|
||||
action="store_true",
|
||||
help="Rigenera `lexicon_it_semantic.json` arricchendo il lessico con IWN-OMW/ItalWordNet.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--babelnet-enrich",
|
||||
action="store_true",
|
||||
help="Prima di generare il cruciverba arricchisce incrementalmente il lessico con BabelNet.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--babelnet-limit",
|
||||
type=int,
|
||||
default=20,
|
||||
help="Numero massimo di parole da interrogare su BabelNet in questa esecuzione.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--babelnet-sleep",
|
||||
type=float,
|
||||
default=0.2,
|
||||
help="Pausa in secondi tra richieste BabelNet consecutive.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--vocabulary",
|
||||
type=Path,
|
||||
@@ -159,7 +192,13 @@ def parse_args() -> argparse.Namespace:
|
||||
parser.add_argument(
|
||||
"--topic",
|
||||
default=DEFAULT_TOPIC,
|
||||
help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.",
|
||||
help="Tema del cruciverba. Puoi indicare un topic o una lista separata da virgole, es. transport,nature,ecology. Se lasci general, i topic possono essere scelti dal lessico con --max-topics.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-topics",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Numero massimo di topic casuali da scegliere dal lessico arricchito quando --topic e' general. Massimo consigliato: 3.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--initial-word-count",
|
||||
@@ -173,6 +212,26 @@ def parse_args() -> argparse.Namespace:
|
||||
default=DEFAULT_THEMED_FILL_WORD_COUNT,
|
||||
help="Numero massimo indicativo di parole aggiunte dal filler da mantenere fortemente legate al tema.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--definitions",
|
||||
action="store_true",
|
||||
help="Genera e stampa le definizioni per le parole inserite nel cruciverba.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lexicon",
|
||||
type=Path,
|
||||
default=None,
|
||||
help=(
|
||||
"File lessicale da usare durante l'esecuzione. Se omesso, il programma usa il lessico "
|
||||
"piu avanzato disponibile, preferendo lexicon_it_curated_llm_aggressive.json."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--definition-babelnet-limit",
|
||||
type=int,
|
||||
default=20,
|
||||
help="Numero massimo di parole del cruciverba da arricchire al volo con BabelNet per generare definizioni.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@@ -222,6 +281,220 @@ def ensure_semantic_lexicon(args: argparse.Namespace) -> None:
|
||||
print(f"- match semantici: {matched}")
|
||||
|
||||
|
||||
def ensure_babelnet_enrichment(args: argparse.Namespace) -> None:
    """Optionally run the incremental BabelNet enrichment before generating the crossword.

    Does nothing unless ``--babelnet-enrich`` was given; prints a notice and
    returns when ``--babelnet-limit`` is not positive. Otherwise it delegates
    to ``run_incremental_enrichment`` and prints a per-word report.
    """
    if not args.babelnet_enrich:
        return
    if args.babelnet_limit <= 0:
        print("BabelNet enrichment saltato: --babelnet-limit <= 0")
        return

    # Imported here, as in the original, so the enricher module is loaded only when used.
    from babelnet_incremental_enricher import run_incremental_enrichment

    enricher_options = SimpleNamespace(
        api_key=load_babelnet_api_key(),
        topic=primary_topic(args.topic),
        difficulty=args.difficulty,
        limit=args.babelnet_limit,
        sleep=args.babelnet_sleep,
        semantic=SEMANTIC_LEXICON_OUTPUT_PATH,
        babelnet=BABELNET_OUTPUT_PATH,
        enriched=ENRICHED_LEXICON_OUTPUT_PATH,
        dry_run=False,
        retry_no_match=False,
    )

    print("Arricchimento BabelNet incrementale")
    print(f"- tema guida: {primary_topic(args.topic)}")
    print(f"- topic attivi: {args.topic}")
    print(f"- limite parole: {args.babelnet_limit}")
    print(f"- chiave: {BABELNET_ENV_KEY} oppure {BABELNET_LOCAL_KEY_PATH.name}")

    outcome = run_incremental_enrichment(enricher_options)

    print("Riepilogo BabelNet")
    print(f"- parole interrogate: {outcome['selected_count']}")
    print(f"- chiamate API reali: {outcome['api_call_count']}")
    print(f"- risposte da cache: {outcome['cache_hit_count']}")
    print(f"- match: {outcome['matched_count']}")
    for log in outcome["word_logs"]:
        print(
            f" {log['word']}: api_calls={log['api_calls']}, "
            f"cache_hits={log['cache_hits']}, risposta={log['responses'] > 0}, "
            f"match={log['matched']}, synsets={log['synsets']}"
        )
    print()
|
||||
|
||||
|
||||
def enrich_words_for_definitions(args: argparse.Namespace, words: List[str]) -> None:
    """Enrich the crossword's own words with BabelNet so clues can be generated.

    Does nothing unless ``--definitions`` was given; prints a notice and
    returns when ``--definition-babelnet-limit`` is not positive. Otherwise
    *words* is passed to ``run_incremental_enrichment`` and a per-word report
    is printed.
    """
    if not args.definitions:
        return
    if args.definition_babelnet_limit <= 0:
        print("Arricchimento BabelNet per definizioni saltato: --definition-babelnet-limit <= 0")
        return

    # Imported here, as in the original, so the enricher module is loaded only when used.
    from babelnet_incremental_enricher import run_incremental_enrichment

    enricher_options = SimpleNamespace(
        api_key=load_babelnet_api_key(),
        topic=primary_topic(args.topic),
        difficulty=args.difficulty,
        limit=args.definition_babelnet_limit,
        sleep=args.babelnet_sleep,
        semantic=SEMANTIC_LEXICON_OUTPUT_PATH,
        babelnet=BABELNET_OUTPUT_PATH,
        enriched=ENRICHED_LEXICON_OUTPUT_PATH,
        dry_run=False,
        retry_no_match=False,
        words=words,
    )

    print()
    print("Arricchimento BabelNet per definizioni")
    print(f"- parole nel cruciverba: {len(set(words))}")
    print(f"- limite parole: {args.definition_babelnet_limit}")

    outcome = run_incremental_enrichment(enricher_options)

    print("Riepilogo BabelNet definizioni")
    print(f"- parole interrogate: {outcome['selected_count']}")
    print(f"- chiamate API reali: {outcome['api_call_count']}")
    print(f"- risposte da cache: {outcome['cache_hit_count']}")
    print(f"- match: {outcome['matched_count']}")
    for log in outcome["word_logs"]:
        print(
            f" {log['word']}: api_calls={log['api_calls']}, "
            f"cache_hits={log['cache_hits']}, risposta={log['responses'] > 0}, "
            f"match={log['matched']}, synsets={log['synsets']}"
        )
|
||||
|
||||
|
||||
def placement_words(placements) -> List[str]:
    """Return the ``word`` attribute of every placement, in order."""
    words: List[str] = []
    for placement in placements:
        words.append(placement.word)
    return words
|
||||
|
||||
|
||||
def print_definitions(args: argparse.Namespace, state) -> None:
    """Print the generated clues for the placed words, then the alpha diagnostics.

    Does nothing unless ``--definitions`` was given. Entries are loaded from
    the runtime lexicon resolved from ``args.lexicon``.
    """
    if not args.definitions:
        return

    lexicon_path = resolve_runtime_lexicon_path(args.lexicon)
    entries = load_enriched_entries(lexicon_path)
    clues = generate_clues(state.placements, entries, primary_topic(args.topic), args.difficulty)

    print()
    print("Definizioni:")
    for clue in clues:
        print(
            f"{clue.number:>2}. {clue.direction} ({clue.x}, {clue.y}) "
            f"[{clue.source}] {clue.text} -> {clue.word.upper()}"
        )

    print_alpha_diagnostics(args, state, entries)
|
||||
|
||||
|
||||
def word_is_on_topic(entry: Dict[str, object], topic: str) -> bool:
    """Decide whether a lexicon entry belongs to the requested topic(s).

    With several topics the entry qualifies as soon as one of them matches.
    For a single topic the checks are, in order: the default topic always
    matches; the entry's ``topics``; its ``semantic.semantic_topics``; a
    BabelNet ``best_synset`` on that topic with ``topic_score`` of at least
    40; finally a positive ``strong_topic_relevance`` score.
    """
    active_topics = parse_topics(topic)
    if len(active_topics) > 1:
        return any(word_is_on_topic(entry, single) for single in active_topics)

    wanted = active_topics[0]
    if wanted == DEFAULT_TOPIC:
        return True

    if wanted in {str(item).lower() for item in entry.get("topics", []) if item}:
        return True

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        if wanted in {str(item).lower() for item in semantic.get("semantic_topics", []) if item}:
            return True

    babelnet = entry.get("babelnet", {})
    best_synset = babelnet.get("best_synset", {}) if isinstance(babelnet, dict) else None
    if isinstance(best_synset, dict):
        try:
            score = int(best_synset.get("topic_score", 0))
        except (TypeError, ValueError):
            # A missing or malformed score counts as zero.
            score = 0
        if best_synset.get("topic") == wanted and score >= 40:
            return True

    try:
        return strong_topic_relevance(entry, wanted) > 0
    except Exception:
        # Any failure in the relevance scorer is treated as "not on topic".
        return False
|
||||
|
||||
|
||||
def pos_label(pos: str) -> str:
    """Map a part-of-speech tag to the Italian plural label used in reports.

    The tag is compared case-insensitively; any tag that is not one of the
    six known codes maps to ``"altri"``.
    """
    tag = str(pos).upper()
    known = (
        ("NOUN", "sostantivi"),
        ("ADJ", "aggettivi"),
        ("VERB", "verbi"),
        ("ADV", "avverbi"),
        ("PREP", "preposizioni"),
        ("CONJ", "congiunzioni"),
    )
    for code, label in known:
        if code == tag:
            return label
    return "altri"
|
||||
|
||||
|
||||
def print_alpha_diagnostics(args: argparse.Namespace, state, entries: Dict[str, Dict[str, object]]) -> None:
    """Print a diagnostic report for the grid held by ``state``.

    The report covers cell occupancy against ``args.target_empty_ratio``, how
    many placed words match the topics in ``args.topic``, and a part-of-speech
    breakdown. Words are lower-cased and de-duplicated before being looked up
    in ``entries``; a word without an entry is listed as off-topic.
    """
    pos_labels = ("sostantivi", "aggettivi", "verbi", "avverbi", "preposizioni", "congiunzioni", "altri")

    unique_words = list(dict.fromkeys(word.lower() for word in placement_words(state.placements)))
    active_topics = parse_topics(args.topic)

    total_cells = state.area()
    filled_cells = len(state.grid)
    empty_cells = total_cells - filled_cells
    # Avoid dividing by zero when the grid has no area.
    if total_cells:
        empty_ratio = empty_cells / total_cells
        filled_ratio = filled_cells / total_cells
    else:
        empty_ratio = 0.0
        filled_ratio = 0.0
    target_empty_cells = round(total_cells * args.target_empty_ratio)
    target_delta = empty_cells - target_empty_cells

    topic_words = []
    off_topic_words = []
    # The default topic matches everything, so it gets no distribution row.
    topic_distribution = {topic: 0 for topic in active_topics if topic != DEFAULT_TOPIC}
    pos_counts = dict.fromkeys(pos_labels, 0)

    for word in unique_words:
        entry = entries.get(word, {})
        label = pos_label(str(entry.get("pos", "")))
        pos_counts[label] = pos_counts.get(label, 0) + 1
        if not entry or not word_is_on_topic(entry, args.topic):
            off_topic_words.append(word)
            continue
        topic_words.append(word)
        for selected_topic in topic_distribution:
            if word_is_on_topic(entry, selected_topic):
                topic_distribution[selected_topic] += 1

    if target_delta > 0:
        distance_line = f"- distanza dal target: {target_delta} celle vuote in piu del target"
    elif target_delta < 0:
        distance_line = f"- distanza dal target: {-target_delta} celle vuote in meno del target"
    else:
        distance_line = "- distanza dal target: centrato"

    print()
    print("Diagnostica alpha:")
    print(f"- parole uniche nello schema: {len(unique_words)}")
    print(f"- celle totali: {total_cells}")
    print(f"- celle riempite: {filled_cells} ({filled_ratio * 100:.1f}%)")
    print(f"- celle vuote: {empty_cells} ({empty_ratio * 100:.1f}%)")
    print(f"- target celle vuote: {target_empty_cells} ({args.target_empty_ratio * 100:.1f}%)")
    print(distance_line)
    print(f"- topic richiesti: {', '.join(active_topics)}")
    print(f"- parole in tema: {len(topic_words)}")
    print(f"- parole fuori tema o non classificate: {len(off_topic_words)}")
    if topic_distribution:
        print("- distribuzione topic:")
        for selected_topic, count in topic_distribution.items():
            print(f"  {selected_topic}: {count}")
    if topic_words:
        print(f"- elenco in tema: {', '.join(topic_words)}")
    if off_topic_words:
        print(f"- elenco fuori tema/non classificate: {', '.join(off_topic_words)}")
    print("- parti del discorso:")
    for label in pos_labels:
        print(f"  {label}: {pos_counts[label]}")
|
||||
|
||||
|
||||
def parse_difficulty(value: str) -> int:
|
||||
text = str(value).strip().lower()
|
||||
if text in DIFFICULTY_ALIASES:
|
||||
@@ -243,7 +516,30 @@ def load_selected_vocabulary(path: Path | None) -> List[str]:
|
||||
return path.read_text(encoding="utf-8").splitlines()
|
||||
|
||||
|
||||
def load_semantic_payload() -> Dict[str, object]:
|
||||
def resolve_runtime_lexicon_path(requested: Path | None) -> Path:
|
||||
global ACTIVE_LEXICON_PATH
|
||||
if requested is not None:
|
||||
path = requested if requested.is_absolute() else Path(__file__).resolve().parent / requested
|
||||
if not path.exists():
|
||||
raise SystemExit(f"Il lessico specificato con --lexicon non esiste: {path}")
|
||||
ACTIVE_LEXICON_PATH = path
|
||||
return path
|
||||
if ACTIVE_LEXICON_PATH is not None:
|
||||
return ACTIVE_LEXICON_PATH
|
||||
base_dir = Path(__file__).resolve().parent
|
||||
for candidate in DEFAULT_RUNTIME_LEXICON_CANDIDATES:
|
||||
path = base_dir / candidate
|
||||
if path.exists():
|
||||
ACTIVE_LEXICON_PATH = path
|
||||
return path
|
||||
ACTIVE_LEXICON_PATH = ENRICHED_LEXICON_OUTPUT_PATH
|
||||
return ACTIVE_LEXICON_PATH
|
||||
|
||||
|
||||
def load_semantic_payload(path: Path | None = None) -> Dict[str, object]:
|
||||
runtime_path = resolve_runtime_lexicon_path(path)
|
||||
if runtime_path.exists():
|
||||
return json.loads(runtime_path.read_text(encoding="utf-8"))
|
||||
if not SEMANTIC_LEXICON_OUTPUT_PATH.exists():
|
||||
lexicon = build_semantic_lexicon()
|
||||
SEMANTIC_LEXICON_OUTPUT_PATH.write_text(
|
||||
@@ -253,6 +549,74 @@ def load_semantic_payload() -> Dict[str, object]:
|
||||
return json.loads(SEMANTIC_LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def parse_topics(value: str) -> List[str]:
    """Split a comma-separated topic string into normalized, unique topic names.

    Each item is stripped and lower-cased; blanks and repeats are dropped
    while first-seen order is kept. A falsy ``value``, or one that yields no
    topic at all, results in ``[DEFAULT_TOPIC]``.
    """
    raw_parts = str(value or DEFAULT_TOPIC).split(",")
    cleaned = (part.strip().lower() for part in raw_parts)
    # dict.fromkeys keeps insertion order, which gives an ordered de-duplication.
    unique = list(dict.fromkeys(name for name in cleaned if name))
    return unique if unique else [DEFAULT_TOPIC]
|
||||
|
||||
|
||||
def primary_topic(value: str) -> str:
    """Return the first topic of ``value`` as normalized by ``parse_topics``."""
    ordered_topics = parse_topics(value)
    return ordered_topics[0]
|
||||
|
||||
|
||||
def available_topics_from_lexicon(payload: Dict[str, object], *, min_words: int = 5) -> List[str]:
    """List the topics that enough crossword-eligible entries carry.

    Only dict entries with a truthy ``allowed_in_crossword`` are counted.
    Topic names are stripped and lower-cased; the default topic, "abstract"
    and "actions" are never reported. The result is sorted alphabetically
    and keeps topics seen on at least ``min_words`` entries.
    """
    skipped = {DEFAULT_TOPIC, "abstract", "actions"}
    usable_entries = (
        entry
        for entry in payload.get("entries", []) or []
        if isinstance(entry, dict) and entry.get("allowed_in_crossword", False)
    )

    tally: Dict[str, int] = {}
    for entry in usable_entries:
        for raw_topic in entry.get("topics", []) or []:
            name = str(raw_topic).strip().lower()
            if name and name not in skipped:
                tally[name] = tally.get(name, 0) + 1

    return sorted(name for name, total in tally.items() if total >= min_words)
|
||||
|
||||
|
||||
def resolve_topics(args: argparse.Namespace, difficulty_level: int) -> List[str]:
    """Choose the active topics for this run and record them on ``args``.

    ``args.topic`` is rewritten as the comma-joined selection and
    ``args.topic_seed_counts`` maps each selected topic to the number of seed
    words ``select_initial_words`` finds for it. The selection holds between
    one and three topics, capped by ``args.max_topics``.

    Explicitly requested topics are kept, truncated to the cap. When only the
    default topic was requested and the cap is above one, topics are drawn
    at random (seeded by ``args.seed``) from the lexicon topics that have at
    least one seed word; if none qualifies the default topic is used.
    """

    def seed_count(name: str) -> int:
        return len(select_initial_words(difficulty_level, name, args.initial_word_count))

    def use_default() -> List[str]:
        args.topic = DEFAULT_TOPIC
        args.topic_seed_counts = {}
        return [DEFAULT_TOPIC]

    requested = parse_topics(args.topic)
    limit = max(1, min(3, int(args.max_topics)))

    if requested != [DEFAULT_TOPIC]:
        chosen = requested[:limit]
        args.topic = ",".join(chosen)
        args.topic_seed_counts = {name: seed_count(name) for name in chosen}
        return chosen

    if limit <= 1:
        return use_default()

    candidates = []
    for name in available_topics_from_lexicon(load_semantic_payload(), min_words=1):
        found = seed_count(name)
        if found > 0:
            candidates.append((name, found))
    if not candidates:
        return use_default()

    random.Random(args.seed).shuffle(candidates)
    # Slicing already stops at the end of the list when fewer candidates exist.
    picked_pairs = candidates[:limit]
    chosen = [name for name, _ in picked_pairs]
    args.topic = ",".join(chosen)
    args.topic_seed_counts = dict(picked_pairs)
    return chosen
|
||||
|
||||
|
||||
def entry_topics(entry: Dict[str, object]) -> tuple[set[str], set[str]]:
|
||||
topics = {str(item).lower() for item in entry.get("topics", [])}
|
||||
semantic_topics = {
|
||||
@@ -271,6 +635,10 @@ def matches_topic_roots(word: str, selected_topic: str) -> bool:
|
||||
|
||||
|
||||
def topic_relevance(entry: Dict[str, object], topic: str) -> int:
|
||||
active_topics = parse_topics(topic)
|
||||
if len(active_topics) > 1:
|
||||
return max(topic_relevance(entry, item) for item in active_topics)
|
||||
|
||||
selected_topic = topic.strip().lower()
|
||||
if selected_topic == DEFAULT_TOPIC:
|
||||
return 20
|
||||
@@ -295,6 +663,10 @@ def topic_relevance(entry: Dict[str, object], topic: str) -> int:
|
||||
|
||||
|
||||
def strong_topic_relevance(entry: Dict[str, object], topic: str) -> int:
|
||||
active_topics = parse_topics(topic)
|
||||
if len(active_topics) > 1:
|
||||
return max(strong_topic_relevance(entry, item) for item in active_topics)
|
||||
|
||||
selected_topic = topic.strip().lower()
|
||||
if selected_topic == DEFAULT_TOPIC:
|
||||
return 20
|
||||
@@ -341,7 +713,7 @@ def is_general_fill_support(entry: Dict[str, object]) -> bool:
|
||||
|
||||
def load_filtered_entries(level: int, topic: str) -> List[Dict[str, object]]:
|
||||
payload = load_semantic_payload()
|
||||
normalized_topic = topic.strip().lower()
|
||||
normalized_topic = ",".join(parse_topics(topic))
|
||||
|
||||
eligible = [
|
||||
entry
|
||||
@@ -400,6 +772,40 @@ def load_semantic_metadata_for_vocabulary(words: List[str], topic: str) -> Dict[
|
||||
|
||||
|
||||
def select_initial_words(level: int, topic: str, count: int) -> List[str]:
|
||||
active_topics = parse_topics(topic)
|
||||
if len(active_topics) > 1:
|
||||
topic_pools = {
|
||||
selected_topic: select_initial_words(level, selected_topic, count)
|
||||
for selected_topic in active_topics
|
||||
}
|
||||
selected: List[str] = []
|
||||
indexes = {selected_topic: 0 for selected_topic in active_topics}
|
||||
|
||||
while len(selected) < count:
|
||||
progressed = False
|
||||
for selected_topic in active_topics:
|
||||
pool = topic_pools.get(selected_topic, [])
|
||||
while indexes[selected_topic] < len(pool) and pool[indexes[selected_topic]] in selected:
|
||||
indexes[selected_topic] += 1
|
||||
if indexes[selected_topic] >= len(pool):
|
||||
continue
|
||||
selected.append(pool[indexes[selected_topic]])
|
||||
indexes[selected_topic] += 1
|
||||
progressed = True
|
||||
if len(selected) >= count:
|
||||
break
|
||||
if not progressed:
|
||||
break
|
||||
|
||||
if len(selected) < count:
|
||||
fallback = select_initial_words(level, DEFAULT_TOPIC, count)
|
||||
for word in fallback:
|
||||
if word not in selected:
|
||||
selected.append(word)
|
||||
if len(selected) >= count:
|
||||
break
|
||||
return selected[:count]
|
||||
|
||||
payload = load_semantic_payload()
|
||||
normalized_topic = topic.strip().lower()
|
||||
abstract_like_topics = {"abstract", "actions"}
|
||||
@@ -408,6 +814,10 @@ def select_initial_words(level: int, topic: str, count: int) -> List[str]:
|
||||
topics, semantic_topics = entry_topics(entry)
|
||||
return selected_topic in topics
|
||||
|
||||
def semantic_matches(entry: Dict[str, object], selected_topic: str) -> bool:
    """Return True when the topic is among the entry's semantic topics but not its ``topics``."""
    declared, semantic_only = entry_topics(entry)
    if selected_topic in declared:
        return False
    return selected_topic in semantic_only
|
||||
|
||||
def word_score(entry: Dict[str, object], selected_topic: str) -> tuple[int, int, int, int, int, int, str]:
|
||||
topics, semantic_topics = entry_topics(entry)
|
||||
quality = int(entry.get("quality_score", 0))
|
||||
@@ -479,6 +889,33 @@ def select_initial_words(level: int, topic: str, count: int) -> List[str]:
|
||||
return False
|
||||
return True
|
||||
|
||||
def is_semantic_seed_friendly(entry: Dict[str, object], selected_topic: str) -> bool:
    """Decide whether a semantically matched entry may be used as a seed word.

    The entry must list ``selected_topic`` among its semantic topics, be 4 to
    13 characters long, be a NOUN, ADJ or VERB, not end with one of
    ``ABSTRACTISH_SUFFIXES``, not carry the "abstract" topic and contain none
    of the topic's blocked substrings. For a non-default topic in
    ``CONCRETE_TOPICS`` that defines required substrings, the word must also
    contain at least one of them.
    """
    word = str(entry.get("form", ""))
    topics, semantic_topics = entry_topics(entry)

    if selected_topic not in semantic_topics:
        return False
    if not 4 <= len(word) <= 13:
        return False
    if str(entry.get("pos", "")) not in {"NOUN", "ADJ", "VERB"}:
        return False
    if word.endswith(ABSTRACTISH_SUFFIXES) or "abstract" in topics:
        return False
    if any(part in word for part in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ())):
        return False

    required = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic)
    must_contain_required = selected_topic in CONCRETE_TOPICS and required and selected_topic != DEFAULT_TOPIC
    if must_contain_required and not any(part in word for part in required):
        return False
    return True
|
||||
|
||||
def overlap_score(left: str, right: str) -> int:
    """Count the characters two words share, respecting multiplicity.

    For every distinct character the smaller of its two occurrence counts is
    added, so "aab" against "abb" scores 2.
    """
    total = 0
    for ch in set(left):
        # A character absent from ``right`` has count 0 there and adds nothing.
        total += min(left.count(ch), right.count(ch))
    return total
|
||||
@@ -548,6 +985,20 @@ def select_initial_words(level: int, topic: str, count: int) -> List[str]:
|
||||
relaxed_pool = sorted(pool, key=lambda entry: word_score(entry, normalized_topic), reverse=True)
|
||||
|
||||
selected = pick_seed_set(strict_pool, normalized_topic, count)
|
||||
if len(selected) < count and normalized_topic != DEFAULT_TOPIC:
|
||||
semantic_pool = [
|
||||
entry
|
||||
for entry in eligible
|
||||
if semantic_matches(entry, normalized_topic)
|
||||
and is_semantic_seed_friendly(entry, normalized_topic)
|
||||
]
|
||||
semantic_selected = pick_seed_set(semantic_pool, normalized_topic, count)
|
||||
for word in semantic_selected:
|
||||
if word not in selected:
|
||||
selected.append(word)
|
||||
if len(selected) >= count:
|
||||
break
|
||||
|
||||
if len(selected) < count and normalized_topic == DEFAULT_TOPIC:
|
||||
relaxed_selected = pick_seed_set(relaxed_pool, normalized_topic, count)
|
||||
for word in relaxed_selected:
|
||||
@@ -569,10 +1020,13 @@ def select_initial_words(level: int, topic: str, count: int) -> List[str]:
|
||||
|
||||
def main() -> None:
|
||||
args = parse_args()
|
||||
args.lexicon = resolve_runtime_lexicon_path(args.lexicon)
|
||||
ensure_vocabulary(args)
|
||||
ensure_lexicon(args)
|
||||
ensure_semantic_lexicon(args)
|
||||
difficulty_level = parse_difficulty(args.difficulty)
|
||||
active_topics = resolve_topics(args, difficulty_level)
|
||||
ensure_babelnet_enrichment(args)
|
||||
initial_words = select_initial_words(difficulty_level, args.topic, args.initial_word_count)
|
||||
|
||||
generator = CrosswordGenerator(
|
||||
@@ -590,7 +1044,13 @@ def main() -> None:
|
||||
print(f"Intersezioni: {initial_state.intersections}")
|
||||
print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})")
|
||||
print(f"Difficolta filler: {args.difficulty} -> livello {difficulty_level}")
|
||||
print(f"Tema filler: {args.topic}")
|
||||
print(f"Topic attivi: {', '.join(active_topics)}")
|
||||
print(f"Lessico runtime: {args.lexicon.name}")
|
||||
if getattr(args, "topic_seed_counts", None):
|
||||
print(
|
||||
"Parole-seme disponibili per topic: "
|
||||
+ ", ".join(f"{topic}={count}" for topic, count in args.topic_seed_counts.items())
|
||||
)
|
||||
if args.seed is not None:
|
||||
print(f"Seed: {args.seed}")
|
||||
print()
|
||||
@@ -600,6 +1060,9 @@ def main() -> None:
|
||||
print(", ".join(initial_words))
|
||||
|
||||
if args.skip_fill:
|
||||
initial_words_for_clues = [placement.word for placement in initial_state.placements]
|
||||
enrich_words_for_definitions(args, initial_words_for_clues)
|
||||
print_definitions(args, initial_state)
|
||||
return
|
||||
|
||||
vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic)
|
||||
@@ -632,6 +1095,10 @@ def main() -> None:
|
||||
direction = "orizzontale" if placement.direction == "H" else "verticale"
|
||||
print(f"{index:>2}. {placement.word} ({placement.x}, {placement.y}) {direction}")
|
||||
|
||||
final_words = [placement.word for placement in final_state.placements]
|
||||
enrich_words_for_definitions(args, final_words)
|
||||
print_definitions(args, final_state)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
473
refine_lexicon_topics.py
Normal file
473
refine_lexicon_topics.py
Normal file
@@ -0,0 +1,473 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Tuple
|
||||
|
||||
from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
REFINED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_refined.json")
|
||||
|
||||
TOPIC_KEYWORDS: Dict[str, Tuple[str, ...]] = {
|
||||
"religion": (
|
||||
"abbazia",
|
||||
"abate",
|
||||
"arcivescovo",
|
||||
"cappella",
|
||||
"cardinale",
|
||||
"chiesa",
|
||||
"clero",
|
||||
"convento",
|
||||
"diocesi",
|
||||
"ecclesiast",
|
||||
"fede",
|
||||
"frate",
|
||||
"mistica",
|
||||
"monaco",
|
||||
"monastero",
|
||||
"parrocchia",
|
||||
"prete",
|
||||
"religion",
|
||||
"sacerdot",
|
||||
"santo",
|
||||
"vescovo",
|
||||
),
|
||||
"ecclesiastical_hierarchy": (
|
||||
"abate",
|
||||
"arcivescovo",
|
||||
"carica ecclesiastica",
|
||||
"cardinale",
|
||||
"clero",
|
||||
"dignità ecclesiastica",
|
||||
"ecclesiast",
|
||||
"ordinazione",
|
||||
"parroco",
|
||||
"patriarca",
|
||||
"pontefice",
|
||||
"prete",
|
||||
"priore",
|
||||
"superiore del monastero",
|
||||
"vescovo",
|
||||
),
|
||||
"honorific_title": (
|
||||
"carica",
|
||||
"epiteto",
|
||||
"nobile",
|
||||
"onore",
|
||||
"onorific",
|
||||
"titolo",
|
||||
),
|
||||
"mysticism": (
|
||||
"asceta",
|
||||
"contemplazione",
|
||||
"estasi",
|
||||
"mistica",
|
||||
"mistico",
|
||||
"monachesimo",
|
||||
"spiritual",
|
||||
),
|
||||
"geography": (
|
||||
"borgo",
|
||||
"città",
|
||||
"comune",
|
||||
"frazione",
|
||||
"geografia",
|
||||
"isola",
|
||||
"località",
|
||||
"paese",
|
||||
"provincia",
|
||||
"regione",
|
||||
"stato",
|
||||
"toponimo",
|
||||
"valle",
|
||||
),
|
||||
"transport": (
|
||||
"aereo",
|
||||
"aeroplano",
|
||||
"auto",
|
||||
"autobus",
|
||||
"autocarro",
|
||||
"barca",
|
||||
"bicicletta",
|
||||
"imbarcazione",
|
||||
"locomotiva",
|
||||
"motore",
|
||||
"nave",
|
||||
"pista",
|
||||
"porto",
|
||||
"stazione",
|
||||
"traghetto",
|
||||
"treno",
|
||||
"trasport",
|
||||
"veicolo",
|
||||
"viaggio",
|
||||
),
|
||||
"nature": (
|
||||
"acqua",
|
||||
"albero",
|
||||
"animale",
|
||||
"bosco",
|
||||
"fiore",
|
||||
"fiume",
|
||||
"foresta",
|
||||
"mare",
|
||||
"montagna",
|
||||
"natura",
|
||||
"pianta",
|
||||
"terra",
|
||||
),
|
||||
"health": (
|
||||
"ambulanza",
|
||||
"anemia",
|
||||
"cura",
|
||||
"farmaco",
|
||||
"malattia",
|
||||
"medic",
|
||||
"ospedale",
|
||||
"paziente",
|
||||
"salute",
|
||||
"soccorso",
|
||||
"terapia",
|
||||
),
|
||||
"war": (
|
||||
"arma",
|
||||
"artiglieria",
|
||||
"assalto",
|
||||
"battaglia",
|
||||
"bombard",
|
||||
"esercito",
|
||||
"fortezza",
|
||||
"guerra",
|
||||
"militare",
|
||||
"soldato",
|
||||
"trincea",
|
||||
),
|
||||
}
|
||||
|
||||
TAG_STOPWORDS = {
|
||||
"and",
|
||||
"con",
|
||||
"da",
|
||||
"dei",
|
||||
"del",
|
||||
"della",
|
||||
"delle",
|
||||
"dello",
|
||||
"di",
|
||||
"e",
|
||||
"il",
|
||||
"in",
|
||||
"la",
|
||||
"le",
|
||||
"lo",
|
||||
"nel",
|
||||
"nella",
|
||||
"per",
|
||||
"su",
|
||||
"the",
|
||||
"un",
|
||||
"una",
|
||||
}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the lexicon refinement script.

    Options: ``--input`` and ``--output`` (paths), ``--replace-general``
    (flag) and ``--min-topic-score`` (int, default 40).
    """
    option_specs = (
        (
            "--input",
            {
                "type": Path,
                "default": ENRICHED_LEXICON_OUTPUT_PATH,
                "help": "File lessicale di partenza, tipicamente lexicon_it_enriched.json.",
            },
        ),
        (
            "--output",
            {
                "type": Path,
                "default": REFINED_LEXICON_OUTPUT_PATH,
                "help": "Nuovo file lessicale raffinato da generare.",
            },
        ),
        (
            "--replace-general",
            {
                "action": "store_true",
                "help": "Se attivo, sostituisce topic=['general'] con i topic suggeriti quando la confidenza e alta.",
            },
        ),
        (
            "--min-topic-score",
            {
                "type": int,
                "default": 40,
                "help": "Punteggio minimo per promuovere un topic suggerito nei topics finali.",
            },
        ),
    )

    parser = argparse.ArgumentParser(
        description="Genera un lessico raffinato con campi aggiuntivi per topic, tag semantici e sensi."
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> Dict[str, object]:
    """Read ``path`` as UTF-8 text and return the decoded JSON document."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
||||
|
||||
|
||||
def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Serialize ``payload`` as indented, non-ASCII-escaped JSON and write it to ``path`` as UTF-8."""
    # Serialize fully before touching the file, so a failure leaves it untouched.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def dedupe(items: Iterable[str]) -> List[str]:
    """Return the stripped, non-empty items with case-insensitive repeats removed.

    Each item is converted with ``str`` and stripped. The first spelling of
    every value is kept, in first-seen order.
    """
    first_seen: Dict[str, str] = {}
    for raw in items:
        cleaned = str(raw).strip()
        if cleaned:
            # setdefault keeps the earliest spelling for a given lower-cased key.
            first_seen.setdefault(cleaned.lower(), cleaned)
    return list(first_seen.values())
|
||||
|
||||
|
||||
def slugify_tag(text: str) -> str:
    """Turn free text into a lower-case ASCII slug with ``_`` separators.

    Accented letters are folded to their base letter first ("città" becomes
    "citta"). Without that step the ASCII-only pattern below would replace
    them with a separator and truncate Italian words ("città" -> "citt").
    Every remaining run of characters outside ``a-z0-9`` becomes a single
    underscore, and leading or trailing underscores are removed.
    """
    import unicodedata  # local import: only this function needs it

    decomposed = unicodedata.normalize("NFKD", text.strip().lower())
    # Drop the combining marks left behind by the decomposition.
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r"[^a-z0-9]+", "_", folded).strip("_")
|
||||
|
||||
|
||||
def cleanup_tag(tag: str) -> str:
    """Return the slug of ``tag``, or ``""`` when the slug is not a useful tag.

    A slug is rejected when it is empty, a single character, or listed in
    ``TAG_STOPWORDS``.
    """
    slug = slugify_tag(tag)
    is_noise = len(slug) <= 1 or slug in TAG_STOPWORDS
    return "" if is_noise else slug
|
||||
|
||||
|
||||
def flatten_text(entry: Dict[str, object]) -> str:
    """Join every textual field of an entry into one lower-cased string.

    Collected in order: the entry topics; the semantic topics, glosses and,
    per synset, the definition and lemmas; then the BabelNet synset refs, the
    best synset's glosses, categories, domains and senses, and the same four
    fields of every other synset. Non-dict sections and synsets are skipped.
    """

    def strings(mapping: Dict[str, object], *keys: str) -> List[str]:
        collected: List[str] = []
        for key in keys:
            collected.extend(str(item) for item in mapping.get(key, []) or [])
        return collected

    synset_fields = ("glosses", "categories", "domains", "senses")
    chunks = strings(entry, "topics")

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        chunks += strings(semantic, "semantic_topics", "glosses")
        for synset in semantic.get("synsets", []) or []:
            if isinstance(synset, dict):
                chunks.append(str(synset.get("definition", "")))
                chunks += strings(synset, "lemmas")

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        chunks += strings(babelnet, "synset_refs")
        best_synset = babelnet.get("best_synset", {})
        if isinstance(best_synset, dict):
            chunks += strings(best_synset, *synset_fields)
        for synset in babelnet.get("synsets", []) or []:
            if isinstance(synset, dict):
                chunks += strings(synset, *synset_fields)

    return " ".join(chunks).lower()
|
||||
|
||||
|
||||
def infer_topic_scores(entry: Dict[str, object]) -> Dict[str, int]:
    """Score each known topic by keyword occurrences in the entry's flattened text.

    Every occurrence of a keyword from ``TOPIC_KEYWORDS`` is worth 12 points
    and a topic's score is capped at 100. Matching is by plain substring, so
    a keyword also counts inside longer words. Topics without any hit are
    left out of the result.
    """
    text = flatten_text(entry)
    scores: Dict[str, int] = {}
    for topic, keywords in TOPIC_KEYWORDS.items():
        hits = sum(text.count(keyword.lower()) for keyword in keywords)
        if hits:
            scores[topic] = min(12 * hits, 100)
    return scores
|
||||
|
||||
|
||||
def collect_semantic_tags(entry: Dict[str, object]) -> List[str]:
    """Gather the cleaned, de-duplicated semantic tags of an entry.

    Raw tags come from the entry topics, the semantic topics, every group of
    ``semantic.raw_relation_terms``, and the categories and domains of the
    BabelNet best synset followed by those of each other synset. Each raw tag
    goes through ``cleanup_tag``; rejected tags are dropped.
    """
    raw_tags = [str(topic) for topic in entry.get("topics", []) or []]

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        raw_tags += [str(topic) for topic in semantic.get("semantic_topics", []) or []]
        relation_terms = semantic.get("raw_relation_terms", {}) or {}
        for group in relation_terms.values():
            raw_tags += [str(item) for item in group or []]

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        best_synset = babelnet.get("best_synset", {})
        # The best synset is read first, then the others, in stored order.
        sources = [best_synset] if isinstance(best_synset, dict) else []
        sources += [synset for synset in babelnet.get("synsets", []) or [] if isinstance(synset, dict)]
        for source in sources:
            for field in ("categories", "domains"):
                raw_tags += [str(item) for item in source.get(field, []) or []]

    slugs = dedupe(cleanup_tag(tag) for tag in raw_tags)
    return [slug for slug in slugs if slug]
|
||||
|
||||
|
||||
def collect_senses(entry: Dict[str, object], topic_scores: Dict[str, int]) -> List[Dict[str, object]]:
    """Build the ``senses`` list of an entry from its semantic and BabelNet data.

    One sense is emitted per semantic synset that has a non-empty definition
    (fixed confidence 0.7), plus at most one for the BabelNet ``best_synset``
    when it has an id and at least one non-blank gloss. Every sense also
    receives the inferred topics whose score is at least 50.

    The BabelNet confidence is ``topic_score / 100`` clamped to the range
    0.4-0.95. A missing, null or non-numeric ``topic_score`` counts as 0.
    """
    strong_topics = [topic for topic, score in topic_scores.items() if score >= 50]
    senses: List[Dict[str, object]] = []

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        for synset in semantic.get("synsets", []) or []:
            if not isinstance(synset, dict):
                continue
            # ``or ""`` keeps an explicit null from turning into the text "None".
            definition = str(synset.get("definition") or "").strip()
            if not definition:
                continue
            senses.append(
                {
                    "source": "semantic",
                    "id": synset.get("id"),
                    "definition": definition,
                    "lemmas": dedupe(str(item) for item in synset.get("lemmas", []) or []),
                    "topics": dedupe(list(semantic.get("semantic_topics", []) or []) + strong_topics),
                    "confidence": 0.7,
                }
            )

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        best_synset = babelnet.get("best_synset", {})
        if isinstance(best_synset, dict) and best_synset.get("id"):
            glosses = [str(item).strip() for item in best_synset.get("glosses", []) or [] if str(item).strip()]
            if glosses:
                try:
                    topic_score = float(best_synset.get("topic_score") or 0)
                except (TypeError, ValueError):
                    # A malformed score must not abort the whole refinement run.
                    topic_score = 0.0
                # A null topic would otherwise be recorded as the topic "None".
                synset_topic = str(best_synset.get("topic") or "").strip()
                senses.append(
                    {
                        "source": "babelnet",
                        "id": best_synset.get("id"),
                        "definition": glosses[0],
                        "lemmas": dedupe(str(item) for item in best_synset.get("senses", []) or []),
                        "topics": dedupe([synset_topic] + strong_topics),
                        "confidence": round(min(max(topic_score / 100.0, 0.4), 0.95), 2),
                    }
                )

    return senses
|
||||
|
||||
|
||||
def collect_geo_tags(entry: Dict[str, object]) -> List[str]:
    """Return ``["toponym_possible"]`` when a BabelNet category looks geographic, else ``[]``.

    A category counts as geographic when its lower-cased text contains one of
    the place markers below.
    """
    markers = ("comuni_", "province_", "regioni_", "città", "paesi", "località")
    tags: List[str] = []

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        synsets = [synset for synset in babelnet.get("synsets", []) or [] if isinstance(synset, dict)]
        for synset in synsets:
            for category in synset.get("categories", []) or []:
                lowered = str(category).lower()
                if any(marker in lowered for marker in markers):
                    tags.append("toponym_possible")

    # Several categories may match; dedupe collapses them to a single tag.
    return dedupe(tags)
|
||||
|
||||
|
||||
def collect_name_tags(entry: Dict[str, object]) -> List[str]:
    """Return ``["capitalized_form"]`` when the entry's ``form`` starts with an upper-case letter."""
    form = str(entry.get("form", ""))
    # Slicing instead of indexing keeps an empty form from raising.
    tags = ["capitalized_form"] if form[:1].isupper() else []
    return dedupe(tags)
|
||||
|
||||
|
||||
def should_review(entry: Dict[str, object], topic_scores: Dict[str, int], senses: List[Dict[str, object]]) -> bool:
    """Tell whether an entry should be flagged for manual review.

    An entry is flagged when any of these holds:
    - its only topic is "general" and no inferred topic scores 50 or more;
    - its BabelNet status is "ambiguous" and the best inferred score is below 50;
    - it has three or more senses and two or more inferred topics scoring 50 or more.
    """
    declared = [str(topic).lower() for topic in entry.get("topics", []) or []]
    strong = [topic for topic, score in topic_scores.items() if score >= 50]
    top_score = max(topic_scores.values(), default=0)
    status = str((entry.get("babelnet", {}) or {}).get("status", ""))

    return (
        (declared == ["general"] and not strong)
        or (status == "ambiguous" and top_score < 50)
        or (len(senses) >= 3 and len(strong) >= 2)
    )
|
||||
|
||||
|
||||
def promoted_topics(
    existing_topics: List[str], topic_scores: Dict[str, int], replace_general: bool, min_topic_score: int
) -> List[str]:
    """Merge an entry's topics with the inferred topics that reach ``min_topic_score``.

    Inferred topics are ordered by descending score, then by name. They are
    appended to the de-duplicated existing topics. The one exception: with
    ``replace_general`` set, an entry whose only topic is exactly "general"
    gets the inferred topics alone, provided there is at least one.
    """
    ranked = sorted(topic_scores.items(), key=lambda item: (-item[1], item[0]))
    inferred = [topic for topic, score in ranked if score >= min_topic_score]
    existing_clean = dedupe(existing_topics)

    only_general = existing_clean == ["general"]
    if replace_general and only_general and inferred:
        return inferred
    return dedupe(existing_clean + inferred)
|
||||
|
||||
|
||||
def refine_entry(entry: Dict[str, object], replace_general: bool, min_topic_score: int) -> Dict[str, object]:
    """Return a deep copy of ``entry`` enriched with the refinement fields.

    Added or replaced keys: ``topics``, ``semantic_tags``, ``senses``,
    ``topic_confidence``, ``topic_suggestions``, ``geo_tags``, ``name_tags``
    and ``needs_review``. The input entry itself is not modified.
    """
    refined = deepcopy(entry)

    # All derived data is computed from the copy before any key is rewritten.
    topic_scores = infer_topic_scores(refined)
    semantic_tags = collect_semantic_tags(refined)
    senses = collect_senses(refined, topic_scores)
    geo_tags = collect_geo_tags(refined)
    name_tags = collect_name_tags(refined)
    current_topics = [str(topic) for topic in refined.get("topics", []) or []]
    ranked_scores = sorted(topic_scores.items(), key=lambda item: (-item[1], item[0]))

    refined["topics"] = promoted_topics(current_topics, topic_scores, replace_general, min_topic_score)
    refined.update(
        {
            "semantic_tags": semantic_tags,
            "senses": senses,
            "topic_confidence": topic_scores,
            "topic_suggestions": [topic for topic, _ in ranked_scores],
            "geo_tags": geo_tags,
            "name_tags": name_tags,
        }
    )
    # NOTE: the review check runs last, so it sees the promoted topics rather
    # than the topics the entry originally had.
    refined["needs_review"] = should_review(refined, topic_scores, senses)
    return refined
|
||||
|
||||
|
||||
def build_refined_lexicon(args: argparse.Namespace) -> Dict[str, object]:
    """Load the input lexicon, refine every dict entry and return the new payload.

    The result has a ``meta`` section (language, version, source file name,
    generation timestamp, counts and the options used) and the refined
    ``entries``. Entries that are not dicts are dropped.

    Raises:
        ValueError: if the input is not a JSON object with an ``entries`` key.
    """
    payload = load_json(args.input)
    if not (isinstance(payload, dict) and "entries" in payload):
        raise ValueError(f"Lessico di input non valido: {args.input}")

    refined_entries = []
    for entry in payload.get("entries", []) or []:
        if isinstance(entry, dict):
            refined_entries.append(refine_entry(entry, args.replace_general, args.min_topic_score))

    review_count = len([entry for entry in refined_entries if entry.get("needs_review")])
    topicful_count = sum(bool(entry.get("topic_suggestions", []) or []) for entry in refined_entries)

    meta = {
        "language": "it",
        "version": 1,
        "base_lexicon": args.input.name,
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(refined_entries),
        "replace_general": args.replace_general,
        "min_topic_score": args.min_topic_score,
        "review_count": review_count,
        "topicful_count": topicful_count,
    }
    return {"meta": meta, "entries": refined_entries}
|
||||
|
||||
|
||||
def main() -> None:
    """Build the refined lexicon, write it to ``args.output`` and print a short summary."""
    args = parse_args()
    payload = build_refined_lexicon(args)
    write_json(args.output, payload)

    meta = payload["meta"]
    print(f"Lessico raffinato generato: {args.output}")
    print(f"Voci totali: {meta['entry_count']}")
    print(f"Voci con suggerimenti di topic: {meta['topicful_count']}")
    print(f"Voci marcate needs_review: {meta['review_count']}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
5
run_babelnet_daily_batch.bat
Normal file
5
run_babelnet_daily_batch.bat
Normal file
@@ -0,0 +1,5 @@
|
||||
@echo off
REM Launch babelnet_daily_batch.py with fixed API call limits and a 0.2 sleep value.
|
||||
setlocal
|
||||
REM Change to the drive and directory of this script so the relative script path resolves.
cd /d "%~dp0"
|
||||
REM Options passed: --api-call-limit 1900, --per-key-api-call-limit 950, --sleep 0.2.
python babelnet_daily_batch.py --api-call-limit 1900 --per-key-api-call-limit 950 --sleep 0.2
|
||||
endlocal
|
||||
Reference in New Issue
Block a user