From 47d8957e15d71d92ad638ddb0c33ed05c9aac5dd Mon Sep 17 00:00:00 2001 From: allebonvi Date: Wed, 29 Apr 2026 13:24:04 +0200 Subject: [PATCH] alpha01 backoffice: crossword engine, lexicon curation and JSON contract --- .gitignore | 14 + apply_llm_rescue_patch.py | 162 ++++++ babelnet_daily_batch.py | 490 ++++++++++++++++ babelnet_incremental_enricher.py | 583 +++++++++++++++++++ build_babelnet_enrichment.py | 110 +++- build_enriched_lexicon.py | 324 +++++++++++ build_llm_rescue_patch.py | 429 ++++++++++++++ build_review_priority.py | 182 ++++++ build_treccani_rescue_patch.py | 153 +++++ clue_generator.py | 423 ++++++++++++++ crossword_contract.md | 209 +++++++ crossword_contract_example_request.json | 37 ++ crossword_contract_example_response.json | 138 +++++ crossword_filler.py | 13 +- curate_lexicon_alpha.py | 611 ++++++++++++++++++++ enrich_review_from_wiktextract_file.py | 492 ++++++++++++++++ enrich_review_from_wiktionary.py | 678 +++++++++++++++++++++++ main.py | 475 +++++++++++++++- refine_lexicon_topics.py | 473 ++++++++++++++++ run_babelnet_daily_batch.bat | 5 + 20 files changed, 5985 insertions(+), 16 deletions(-) create mode 100644 .gitignore create mode 100644 apply_llm_rescue_patch.py create mode 100644 babelnet_daily_batch.py create mode 100644 babelnet_incremental_enricher.py create mode 100644 build_enriched_lexicon.py create mode 100644 build_llm_rescue_patch.py create mode 100644 build_review_priority.py create mode 100644 build_treccani_rescue_patch.py create mode 100644 clue_generator.py create mode 100644 crossword_contract.md create mode 100644 crossword_contract_example_request.json create mode 100644 crossword_contract_example_response.json create mode 100644 curate_lexicon_alpha.py create mode 100644 enrich_review_from_wiktextract_file.py create mode 100644 enrich_review_from_wiktionary.py create mode 100644 refine_lexicon_topics.py create mode 100644 run_babelnet_daily_batch.bat diff --git a/.gitignore b/.gitignore new file mode 100644 
index 0000000..95871fd --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +__pycache__/ +*.pyc +.babelnet_cache.json +.wiktionary_cache.json +.wiktextract_it_index.json +.babelnet_api_key.local +logs/ +raw-wiktextract-data.jsonl +lexicon_it*.json +llm_rescue_patch.json +treccani_rescue_patch.json +to_be_review*.json +_*.json +idee.txt diff --git a/apply_llm_rescue_patch.py b/apply_llm_rescue_patch.py new file mode 100644 index 0000000..4484b68 --- /dev/null +++ b/apply_llm_rescue_patch.py @@ -0,0 +1,162 @@ +from __future__ import annotations + +import argparse +import json +from pathlib import Path +from typing import Any, Dict, List, Tuple + + +DEFAULT_LEXICON_PATH = Path(__file__).with_name("lexicon_it_curated.json") +DEFAULT_PATCH_PATH = Path(__file__).with_name("llm_rescue_patch.json") +DEFAULT_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_curated_llm.json") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Applica una patch LLM rescue al lessico curato per produrre un lessico operativo aggiornato." 
+ ) + parser.add_argument("--lexicon", type=Path, default=DEFAULT_LEXICON_PATH, help="Lessico curato di partenza.") + parser.add_argument("--patch", type=Path, default=DEFAULT_PATCH_PATH, help="Patch LLM rescue da applicare.") + parser.add_argument("--output", type=Path, default=DEFAULT_OUTPUT_PATH, help="Lessico aggiornato in uscita.") + parser.add_argument( + "--min-confidence", + type=float, + default=0.6, + help="Confidenza minima per applicare automaticamente una definizione rescue.", + ) + parser.add_argument( + "--include-needs-review", + action="store_true", + help="Applica anche voci marcate needs_human_review=true se superano la soglia di confidenza.", + ) + return parser.parse_args() + + +def load_json(path: Path, default: object) -> object: + if not path.exists(): + return default + return json.loads(path.read_text(encoding="utf-8")) + + +def write_json(path: Path, payload: object) -> None: + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + +def normalize_key(form: str, pos: str) -> Tuple[str, str]: + return (str(form or "").strip().lower(), str(pos or "").strip().upper()) + + +def merge_topics(existing: List[str], incoming: List[str]) -> List[str]: + merged: List[str] = [] + seen = set() + for item in list(existing or []) + list(incoming or []): + value = str(item).strip() + if not value: + continue + key = value.lower() + if key in seen: + continue + seen.add(key) + merged.append(value) + return merged + + +def apply_patch(args: argparse.Namespace) -> Dict[str, Any]: + lexicon_payload = load_json(args.lexicon, {"entries": []}) + patch_payload = load_json(args.patch, {"entries": []}) + if not isinstance(lexicon_payload, dict): + raise ValueError(f"Lessico non valido: {args.lexicon}") + lexicon = lexicon_payload.get("entries") + if not isinstance(lexicon, list): + raise ValueError(f"Lessico non valido: {args.lexicon}") + if not isinstance(patch_payload, dict): + raise ValueError(f"Patch non valida: {args.patch}") 
+ + patch_entries = patch_payload.get("entries") or [] + patch_by_key = {} + for entry in patch_entries: + if not isinstance(entry, dict): + continue + patch_by_key[normalize_key(entry.get("form", ""), entry.get("pos", ""))] = entry + + applied = 0 + skipped = 0 + for entry in lexicon: + if not isinstance(entry, dict): + continue + patch = patch_by_key.get(normalize_key(entry.get("form", ""), entry.get("pos", ""))) + if not patch: + continue + confidence = float(patch.get("confidence", 0.0) or 0.0) + needs_review = bool(patch.get("needs_human_review", True)) + definition = str(patch.get("rescue_definition", "")).strip() + if not definition: + skipped += 1 + continue + if confidence < float(args.min_confidence): + skipped += 1 + continue + if needs_review and not args.include_needs_review: + skipped += 1 + continue + + entry["preferred_definition"] = definition + entry["preferred_source"] = patch.get("rescue_source", "llm_rescue") + clue_defs = entry.get("clue_definitions") or {} + if not isinstance(clue_defs, dict): + clue_defs = {} + for level in ("easy", "medium", "hard", "expert"): + clue_defs[level] = definition + entry["clue_definitions"] = clue_defs + + entry["topics"] = merge_topics(entry.get("topics", []), patch.get("rescue_topics", [])) + entry["semantic_tags"] = merge_topics(entry.get("semantic_tags", []), patch.get("rescue_semantic_tags", [])) + entry["alpha_ready"] = True + review_reasons = [reason for reason in (entry.get("review_reasons") or []) if reason != "no_viable_definition"] + if not args.include_needs_review: + review_reasons = [reason for reason in review_reasons if reason != "flagged_by_refined_stage"] + entry["review_reasons"] = review_reasons + entry["llm_rescue"] = { + "definition": definition, + "source": patch.get("rescue_source", "llm_rescue"), + "topics": patch.get("rescue_topics", []), + "semantic_tags": patch.get("rescue_semantic_tags", []), + "notes": patch.get("rescue_notes", ""), + "confidence": confidence, + 
"needs_human_review": needs_review, + "status": patch.get("status", ""), + } + applied += 1 + + meta = dict(lexicon_payload.get("meta") or {}) + meta["base_lexicon"] = args.lexicon.name + meta["generated_from_patch"] = args.patch.name + meta["generated_by"] = "apply_llm_rescue_patch.py" + meta["entry_count"] = len(lexicon) + meta["llm_rescue_applied"] = applied + meta["llm_rescue_skipped"] = skipped + meta["alpha_ready_count"] = sum(1 for item in lexicon if isinstance(item, dict) and item.get("alpha_ready")) + meta["review_count"] = sum( + 1 + for item in lexicon + if isinstance(item, dict) and (item.get("review_reasons") or item.get("needs_review")) + ) + output_payload = {"meta": meta, "entries": lexicon} + write_json(args.output, output_payload) + return { + "applied": applied, + "skipped": skipped, + "output": str(args.output), + } + + +def main() -> None: + args = parse_args() + result = apply_patch(args) + print(f"Lessico aggiornato generato: {result['output']}") + print(f"Patch applicate: {result['applied']}") + print(f"Voci saltate: {result['skipped']}") + + +if __name__ == "__main__": + main() diff --git a/babelnet_daily_batch.py b/babelnet_daily_batch.py new file mode 100644 index 0000000..2f34a20 --- /dev/null +++ b/babelnet_daily_batch.py @@ -0,0 +1,490 @@ +from __future__ import annotations + +import argparse +import json +from copy import deepcopy +from datetime import datetime +from pathlib import Path +from types import SimpleNamespace +from typing import Dict, Iterable, List, Optional, Tuple + +from babelnet_incremental_enricher import ( + DEFAULT_TOPIC, + merge_babelnet_entries, + rebuild_enriched, +) +from build_babelnet_enrichment import ( + BABELNET_CACHE_PATH, + BABELNET_ENV_KEY, + BABELNET_OUTPUT_PATH, + BabelNetApiCallLimitReached, + BabelNetKeyUnavailable, + POS_TO_BABELNET, + enrich_entry, + load_babelnet_api_keys, + load_json, + write_json, +) +from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH +from build_semantic_lexicon 
import SEMANTIC_LEXICON_OUTPUT_PATH + + +LOG_DIR = Path(__file__).with_name("logs") +DEFAULT_API_CALL_LIMIT = 950 +DEFAULT_PER_KEY_API_CALL_LIMIT = 950 +DEFAULT_WORD_LIMIT = 10_000 +MIN_WORD_LENGTH = 3 +MAX_WORD_LENGTH = 16 +USEFUL_POS_PRIORITY = { + "NOUN": 6, + "VERB": 5, + "ADJ": 4, + "ADV": 3, +} + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Batch giornaliero per fondere progressivamente ItalWordNet e BabelNet: " + "arricchisce parole mancanti, aggiorna lexicon_it_babelnet.json e rigenera lexicon_it_enriched.json." + ) + ) + parser.add_argument( + "--api-call-limit", + type=int, + default=DEFAULT_API_CALL_LIMIT, + help="Numero massimo complessivo di chiamate API BabelNet reali consentite in questa esecuzione.", + ) + parser.add_argument( + "--per-key-api-call-limit", + type=int, + default=DEFAULT_PER_KEY_API_CALL_LIMIT, + help="Numero massimo di chiamate API reali consentite per ciascuna chiave caricata.", + ) + parser.add_argument( + "--token-index", + default=None, + help="Usa una o piu chiavi locali, contando da 1. Esempi: --token-index 2 oppure --token-index 1,2,3.", + ) + parser.add_argument( + "--token-indexes", + default=None, + help="Alias esplicito per una lista di chiavi locali. 
Esempio: --token-indexes 1,2,3.", + ) + parser.add_argument( + "--word-limit", + type=int, + default=DEFAULT_WORD_LIMIT, + help="Numero massimo di parole candidate da tentare in questa esecuzione.", + ) + parser.add_argument( + "--sleep", + type=float, + default=0.2, + help="Pausa tra richieste API.", + ) + parser.add_argument( + "--topic", + default=None, + help="Topic opzionale per concentrare il batch su una parte del lessico.", + ) + parser.add_argument( + "--include-not-crossword", + action="store_true", + help="Include anche voci non marcate allowed_in_crossword.", + ) + parser.add_argument( + "--retry-no-match", + action="store_true", + help="Riprova anche parole gia marcate come no_match.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Mostra le prossime parole candidate senza chiamare BabelNet e senza scrivere file.", + ) + parser.add_argument( + "--ignore-cache", + action="store_true", + help="Ignora la cache in questa esecuzione diagnostica, utile per testare un token specifico.", + ) + parser.add_argument( + "--semantic", + type=Path, + default=SEMANTIC_LEXICON_OUTPUT_PATH, + help="Lessico semantico completo di partenza.", + ) + parser.add_argument( + "--babelnet", + type=Path, + default=BABELNET_OUTPUT_PATH, + help="Archivio incrementale degli arricchimenti BabelNet.", + ) + parser.add_argument( + "--enriched", + type=Path, + default=ENRICHED_LEXICON_OUTPUT_PATH, + help="Lessico fuso da rigenerare dopo il batch.", + ) + return parser.parse_args() + + +def entry_key(entry: Dict[str, object]) -> Tuple[str, str]: + form = str(entry.get("normalized_form") or entry.get("form") or "").strip().lower() + pos = str(entry.get("pos") or "").strip().upper() + return form, pos + + +def load_source_payload(enriched_path: Path, semantic_path: Path) -> Dict[str, object]: + if enriched_path.exists(): + payload = load_json(enriched_path, {}) + if isinstance(payload, dict) and "entries" in payload: + return payload + payload = 
load_json(semantic_path, {}) + if isinstance(payload, dict) and "entries" in payload: + return payload + raise ValueError(f"Nessun lessico valido trovato: {enriched_path} / {semantic_path}") + + +def babelnet_status(entry: Dict[str, object]) -> str: + babelnet = entry.get("babelnet", {}) + if isinstance(babelnet, dict): + return str(babelnet.get("status", "not_requested")) + return "not_requested" + + +def entry_topics(entry: Dict[str, object]) -> set[str]: + topics = {str(item).lower() for item in entry.get("topics", []) or [] if item} + semantic = entry.get("semantic", {}) + if isinstance(semantic, dict): + topics.update(str(item).lower() for item in semantic.get("semantic_topics", []) or [] if item) + return topics + + +def eligible_entry(entry: Dict[str, object], args: argparse.Namespace) -> bool: + word = str(entry.get("form", "")).strip().lower() + pos = str(entry.get("pos", "")).strip().upper() + status = babelnet_status(entry) + allowed_statuses = {"not_requested", "api_error"} + if args.retry_no_match: + allowed_statuses.add("no_match") + + if status not in allowed_statuses: + return False + if pos not in POS_TO_BABELNET: + return False + if not word.isalpha() or not MIN_WORD_LENGTH <= len(word) <= MAX_WORD_LENGTH: + return False + if not args.include_not_crossword and not entry.get("allowed_in_crossword", False): + return False + if args.topic and args.topic.strip().lower() not in entry_topics(entry): + return False + return True + + +def candidate_priority(entry: Dict[str, object]) -> Tuple[int, int, int, int, int, str]: + word = str(entry.get("form", "")) + pos = str(entry.get("pos", "")).upper() + topics = {str(item).lower() for item in entry.get("topics", []) or []} + semantic = entry.get("semantic", {}) + semantic_topics = set() + if isinstance(semantic, dict): + semantic_topics = {str(item).lower() for item in semantic.get("semantic_topics", []) or []} + + useful_topic_bonus = 2 if topics - {DEFAULT_TOPIC, "abstract", "actions"} else 0 + 
semantic_topic_bonus = 1 if semantic_topics else 0 + length_bonus = 3 if 4 <= len(word) <= 11 else 1 + return ( + useful_topic_bonus, + semantic_topic_bonus, + int(entry.get("quality_score", 0)), + USEFUL_POS_PRIORITY.get(pos, 0), + length_bonus, + word, + ) + + +def select_candidates(payload: Dict[str, object], args: argparse.Namespace) -> List[Dict[str, object]]: + candidates = [ + entry + for entry in payload.get("entries", []) or [] + if isinstance(entry, dict) and eligible_entry(entry, args) + ] + candidates.sort(key=candidate_priority, reverse=True) + return candidates[: max(0, args.word_limit)] + + +def progress_counts(payload: Dict[str, object]) -> Dict[str, int]: + counts: Dict[str, int] = {} + for entry in payload.get("entries", []) or []: + if not isinstance(entry, dict): + continue + status = babelnet_status(entry) + counts[status] = counts.get(status, 0) + 1 + return counts + + +def parse_token_indexes(value: Optional[str], key_count: int, option_name: str) -> Optional[List[int]]: + if value is None: + return None + + selected: List[int] = [] + seen = set() + for raw_part in str(value).replace(";", ",").split(","): + part = raw_part.strip() + if not part: + continue + try: + index = int(part) + except ValueError as exc: + raise SystemExit(f"{option_name} deve contenere solo numeri separati da virgola.") from exc + if not 1 <= index <= key_count: + raise SystemExit( + f"{option_name} contiene {index}, ma deve essere tra 1 e {key_count}. Chiavi caricate: {key_count}." 
+ ) + if index in seen: + continue + selected.append(index) + seen.add(index) + + if not selected: + raise SystemExit(f"{option_name} non contiene nessun indice valido.") + return selected + + +def write_batch_log(payload: Dict[str, object]) -> Path: + LOG_DIR.mkdir(exist_ok=True) + timestamp = datetime.now().astimezone().strftime("%Y%m%d_%H%M%S") + path = LOG_DIR / f"babelnet_batch_{timestamp}.json" + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + return path + + +def run_batch(args: argparse.Namespace) -> Dict[str, object]: + source_payload = load_source_payload(args.enriched, args.semantic) + candidates = select_candidates(source_payload, args) + before_counts = progress_counts(source_payload) + + if args.dry_run: + return { + "mode": "dry-run", + "candidate_count": len(candidates), + "selected_words": [entry.get("form") for entry in candidates[:50]], + "before_counts": before_counts, + } + + api_keys = load_babelnet_api_keys() + if not api_keys: + raise SystemExit( + f"Chiave BabelNet mancante. Imposta {BABELNET_ENV_KEY} oppure crea .babelnet_api_key.local." 
+ ) + token_indexes = parse_token_indexes(args.token_index, len(api_keys), "--token-index") + token_indexes_alias = parse_token_indexes(args.token_indexes, len(api_keys), "--token-indexes") + if token_indexes and token_indexes_alias: + raise SystemExit("Usa solo uno tra --token-index e --token-indexes.") + selected_token_indexes = token_indexes or token_indexes_alias + if selected_token_indexes: + api_keys = [api_keys[index - 1] for index in selected_token_indexes] + + cache = {} if args.ignore_cache else load_json(BABELNET_CACHE_PATH, {}) + if not isinstance(cache, dict): + cache = {} + babelnet_payload = load_json(args.babelnet, {"entries": []}) + if not isinstance(babelnet_payload, dict): + babelnet_payload = {"entries": []} + + global_stats = { + "api_calls": 0, + "cache_hits": 0, + "responses": 0, + "api_call_limit": max(0, args.api_call_limit), + } + per_key_limit = max(0, args.per_key_api_call_limit) + key_stats = [ + { + "key_index": selected_token_indexes[index] if selected_token_indexes else index + 1, + "local_key_index": index + 1, + "api_calls": 0, + "cache_hits": 0, + "responses": 0, + "api_call_limit": per_key_limit, + } + for index, _ in enumerate(api_keys) + ] + enriched_entries: List[Dict[str, object]] = [] + word_logs = [] + stopped_reason = "completed" + + def select_key_index() -> Optional[int]: + available = [ + (stats["api_calls"], index) + for index, stats in enumerate(key_stats) + if stats["api_calls"] < stats["api_call_limit"] + ] + if not available: + return None + available.sort() + return available[0][1] + + for index, entry in enumerate(candidates, start=1): + if global_stats["api_calls"] >= global_stats["api_call_limit"]: + stopped_reason = "api_call_limit" + break + key_index = select_key_index() + if key_index is None: + stopped_reason = "per_key_api_call_limit" + break + + before_api_calls = global_stats["api_calls"] + before_cache_hits = global_stats["cache_hits"] + before_responses = global_stats["responses"] + 
before_key_api_calls = key_stats[key_index]["api_calls"] + before_key_cache_hits = key_stats[key_index]["cache_hits"] + before_key_responses = key_stats[key_index]["responses"] + + updated = deepcopy(entry) + updated.pop("babelnet", None) + try: + updated["babelnet"] = enrich_entry(updated, api_keys[key_index], cache, args.sleep, key_stats[key_index]) + except BabelNetApiCallLimitReached: + global_stats["api_calls"] += key_stats[key_index]["api_calls"] - before_key_api_calls + global_stats["cache_hits"] += key_stats[key_index]["cache_hits"] - before_key_cache_hits + global_stats["responses"] += key_stats[key_index]["responses"] - before_key_responses + stopped_reason = "per_key_api_call_limit" + break + except BabelNetKeyUnavailable as exc: + global_stats["api_calls"] += key_stats[key_index]["api_calls"] - before_key_api_calls + global_stats["cache_hits"] += key_stats[key_index]["cache_hits"] - before_key_cache_hits + global_stats["responses"] += key_stats[key_index]["responses"] - before_key_responses + key_stats[key_index]["api_calls"] = key_stats[key_index]["api_call_limit"] + word_logs.append( + { + "index": index, + "word": updated.get("form"), + "pos": updated.get("pos"), + "key_index": key_stats[key_index]["key_index"], + "api_calls": global_stats["api_calls"] - before_api_calls, + "cache_hits": global_stats["cache_hits"] - before_cache_hits, + "responses": global_stats["responses"] - before_responses, + "matched": False, + "synsets": 0, + "reason": "key_unavailable_or_daily_limit", + "error": str(exc), + } + ) + print( + f"[{index}/{len(candidates)}] {updated.get('form')}: " + f"token={key_stats[key_index]['key_index']} non disponibile o limite giornaliero raggiunto" + ) + if select_key_index() is None: + stopped_reason = "all_keys_unavailable_or_daily_limit" + break + continue + + global_stats["api_calls"] += key_stats[key_index]["api_calls"] - before_key_api_calls + global_stats["cache_hits"] += key_stats[key_index]["cache_hits"] - before_key_cache_hits + 
global_stats["responses"] += key_stats[key_index]["responses"] - before_key_responses + + enriched_entries.append(updated) + write_json(BABELNET_CACHE_PATH, cache) + + word_log = { + "index": index, + "word": updated.get("form"), + "pos": updated.get("pos"), + "key_index": key_stats[key_index]["key_index"], + "api_calls": global_stats["api_calls"] - before_api_calls, + "cache_hits": global_stats["cache_hits"] - before_cache_hits, + "responses": global_stats["responses"] - before_responses, + "matched": bool(updated.get("babelnet", {}).get("matched")), + "synsets": len(updated.get("babelnet", {}).get("synsets", []) or []), + "reason": updated.get("babelnet", {}).get("reason"), + } + word_logs.append(word_log) + print( + f"[{index}/{len(candidates)}] {word_log['word']}: " + f"token={word_log['key_index']} api_calls={word_log['api_calls']} cache_hits={word_log['cache_hits']} " + f"match={word_log['matched']} tot_api={global_stats['api_calls']}/{global_stats['api_call_limit']}" + ) + + merged_babelnet = merge_babelnet_entries( + babelnet_payload, + enriched_entries, + args.topic or "all", + "all", + ) + write_json(args.babelnet, merged_babelnet) + enriched_payload = rebuild_enriched( + args.semantic, + args.babelnet, + args.enriched, + args.topic or DEFAULT_TOPIC, + ) + after_counts = progress_counts(enriched_payload) + + total_entries = int(enriched_payload.get("meta", {}).get("entry_count", 0)) + covered = total_entries - after_counts.get("not_requested", 0) + coverage = covered / total_entries if total_entries else 0.0 + + result = { + "mode": "batch", + "started_topic": args.topic, + "stopped_reason": stopped_reason, + "candidate_count": len(candidates), + "attempted_words": len(enriched_entries), + "matched_words": sum(1 for entry in enriched_entries if entry.get("babelnet", {}).get("matched")), + "api_calls": global_stats["api_calls"], + "cache_hits": global_stats["cache_hits"], + "responses": global_stats["responses"], + "api_call_limit": 
global_stats["api_call_limit"], + "api_key_count": len(api_keys), + "forced_token_indexes": selected_token_indexes, + "per_key_api_call_limit": per_key_limit, + "per_key_stats": key_stats, + "before_counts": before_counts, + "after_counts": after_counts, + "total_entries": total_entries, + "covered_entries": covered, + "coverage_ratio": coverage, + "word_logs": word_logs, + } + log_path = write_batch_log(result) + result["log_path"] = str(log_path) + return result + + +def print_result(result: Dict[str, object]) -> None: + if result["mode"] == "dry-run": + print("Dry-run batch BabelNet") + print(f"Candidate selezionate: {result['candidate_count']}") + print(f"Stati iniziali: {result['before_counts']}") + print("Prime parole:") + for index, word in enumerate(result["selected_words"], start=1): + print(f"{index:>2}. {word}") + return + + print("Batch BabelNet completato") + print(f"- motivo stop: {result['stopped_reason']}") + print(f"- parole tentate: {result['attempted_words']}/{result['candidate_count']}") + print(f"- parole con match: {result['matched_words']}") + print(f"- chiamate API reali: {result['api_calls']}/{result['api_call_limit']}") + print(f"- chiavi caricate: {result['api_key_count']} (limite per chiave: {result['per_key_api_call_limit']})") + if result.get("forced_token_indexes"): + print(f"- token forzati: {', '.join('#' + str(index) for index in result['forced_token_indexes'])}") + for item in result["per_key_stats"]: + print(f" chiave #{item['key_index']}: {item['api_calls']}/{item['api_call_limit']} chiamate API") + print(f"- cache hit: {result['cache_hits']}") + print(f"- copertura lessico: {result['covered_entries']}/{result['total_entries']} ({result['coverage_ratio'] * 100:.1f}%)") + print(f"- stati dopo: {result['after_counts']}") + print(f"- log: {result['log_path']}") + + +def main() -> None: + args = parse_args() + result = run_batch(args) + print_result(result) + + +if __name__ == "__main__": + main() diff --git 
a/babelnet_incremental_enricher.py b/babelnet_incremental_enricher.py new file mode 100644 index 0000000..c0ca422 --- /dev/null +++ b/babelnet_incremental_enricher.py @@ -0,0 +1,583 @@ +from __future__ import annotations + +import argparse +import os +from copy import deepcopy +from datetime import datetime +from pathlib import Path +from types import SimpleNamespace +from typing import Dict, Iterable, List, Optional, Tuple + +from build_babelnet_enrichment import ( + BABELNET_CACHE_PATH, + BABELNET_ENV_KEY, + BABELNET_OUTPUT_PATH, + POS_TO_BABELNET, + enrich_entry, + load_json, + write_json, +) +from build_enriched_lexicon import ( + ENRICHED_LEXICON_OUTPUT_PATH, + build_enriched_lexicon, + write_json as write_enriched_json, +) +from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH + + +DIFFICULTY_ALIASES: Dict[str, int] = { + "easy": 1, + "medium": 2, + "hard": 4, + "expert": 5, +} +DEFAULT_TOPIC = "general" +ABSTRACTISH_SUFFIXES = ("zione", "zioni", "mento", "menti", "ita", "ezza", "anza", "enza", "ismo") +FILL_ALLOWED_POS = {"NOUN", "VERB", "ADJ", "ADV", "PREP", "CONJ"} +GENERAL_FILL_MIN_QUALITY = 6 +GENERAL_FILL_MAX_LENGTH = 10 +SOFT_RELATED_FILL_LIMIT = 120 +CONCRETE_TOPICS = { + "animals", + "plants", + "nature", + "ecology", + "geography", + "weather", + "sea", + "mountain", + "health", + "science", + "sport", + "history", + "school", + "cinema", + "literature", + "food", + "city", + "transport", + "work", + "home", +} + +TOPIC_SEED_REQUIRED_SUBSTRINGS: Dict[str, Tuple[str, ...]] = { + "transport": ( + "auto", "mot", "tren", "nav", "barc", "port", "pist", "vol", "aer", + "bici", "cicl", "rimorch", "reattor", "vettur", "ambul", "imbarc", + "trattor", "carr", "vap", "rota", "ruot", + ), + "animals": ( + "can", "gatt", "lup", "ors", "pesc", "aquil", "anatr", "cavall", + "serpent", "tig", "leon", "volp", "cerv", "capr", "pecor", + ), + "nature": ( + "mar", "lag", "fium", "vent", "bosch", "mont", "collin", "isol", + "rocc", "terra", "acqu", "fiore", 
"fogli", "radic", "affluent", + "litoral", "piogg", "nev", "onda", "clim", + ), + "cinema": ( + "film", "cin", "teatr", "attor", "scen", "reg", "doppi", "dialog", + "comic", "div", "docu", "pellic", "spettacol", + ), +} + +TOPIC_SEED_BLOCKED_SUBSTRINGS: Dict[str, Tuple[str, ...]] = { + "transport": ( + "intervist", "intratten", "speriment", "stermin", "investig", + "intervent", "centometr", "sintetizz", "erot", "adoraz", "esalt", + "eccit", "traduz", "fluttu", "sollecit", + ), + "animals": ( + "assicur", "finanz", "coediz", "camerier", "servitor", "indic", + "estens", "diffus", "difensor", "spessor", "maggior", + ), + "cinema": ( + "manifest", "riediz", "dissimul", "diffus", "difensor", "estens", + "malumor", "eversor", + ), +} + +ENRICHABLE_STATUSES = {"not_requested", "api_error"} + +BABELNET_TOPIC_SAFE_PREFIXES: Dict[str, Tuple[str, ...]] = { + "transport": ( + "ambul", + "aer", + "autobus", + "autocar", + "automob", + "autostrad", + "autoveic", + "autovett", + "bicicl", + "ciclo", + "imbarc", + "locom", + "motoc", + "motr", + "navig", + "rimorch", + "trattor", + "tren", + "veicol", + "vettur", + ), +} + + +def parse_difficulty(value: str) -> int: + text = str(value).strip().lower() + if text in DIFFICULTY_ALIASES: + return DIFFICULTY_ALIASES[text] + try: + level = int(text) + except ValueError as exc: + raise SystemExit( + "Valore non valido per --difficulty. Usa easy, medium, hard, expert oppure un intero tra 1 e 5." + ) from exc + if not 1 <= level <= 5: + raise SystemExit("Il valore numerico di --difficulty deve essere compreso tra 1 e 5.") + return level + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Arricchisce incrementalmente il lessico: seleziona parole mancanti, " + "chiama BabelNet entro un limite e rigenera lexicon_it_enriched.json." + ) + ) + parser.add_argument( + "--api-key", + default=os.environ.get(BABELNET_ENV_KEY), + help=f"Chiave API BabelNet. 
In alternativa imposta la variabile ambiente {BABELNET_ENV_KEY}.", + ) + parser.add_argument( + "--topic", + default=DEFAULT_TOPIC, + help="Topic per cui scegliere le prossime parole da arricchire.", + ) + parser.add_argument( + "--difficulty", + default="medium", + help="Difficolta massima: easy, medium, hard, expert oppure 1-5.", + ) + parser.add_argument( + "--limit", + type=int, + default=50, + help="Numero massimo di parole da arricchire in questa esecuzione.", + ) + parser.add_argument( + "--sleep", + type=float, + default=0.2, + help="Pausa tra richieste API.", + ) + parser.add_argument( + "--semantic", + type=Path, + default=SEMANTIC_LEXICON_OUTPUT_PATH, + help="Lessico semantico completo di partenza.", + ) + parser.add_argument( + "--babelnet", + type=Path, + default=BABELNET_OUTPUT_PATH, + help="Archivio degli arricchimenti BabelNet parziali.", + ) + parser.add_argument( + "--enriched", + type=Path, + default=ENRICHED_LEXICON_OUTPUT_PATH, + help="Lessico arricchito da aggiornare.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Mostra le parole candidate senza chiamare BabelNet e senza scrivere file.", + ) + parser.add_argument( + "--retry-no-match", + action="store_true", + help="Riprova anche parole gia marcate come no_match.", + ) + parser.add_argument( + "--words", + nargs="*", + default=None, + help="Parole specifiche da arricchire, utile per generare definizioni sul cruciverba finale.", + ) + return parser.parse_args() + + +def entry_key(entry: Dict[str, object]) -> Tuple[str, str]: + form = str(entry.get("normalized_form") or entry.get("form") or "").strip().lower() + pos = str(entry.get("pos") or "").strip().upper() + return form, pos + + +def dedupe(items: Iterable[Dict[str, object]]) -> List[Dict[str, object]]: + seen = set() + result = [] + for item in items: + key = entry_key(item) + if key in seen: + continue + seen.add(key) + result.append(item) + return result + + +def entry_topics(entry: Dict[str, object]) -> 
Tuple[set[str], set[str]]: + topics = {str(item).lower() for item in entry.get("topics", []) if item} + semantic = entry.get("semantic", {}) + semantic_topics = set() + if isinstance(semantic, dict): + semantic_topics = {str(item).lower() for item in semantic.get("semantic_topics", []) if item} + return topics, semantic_topics + + +def current_babelnet_status(entry: Dict[str, object]) -> str: + babelnet = entry.get("babelnet", {}) + if isinstance(babelnet, dict): + return str(babelnet.get("status", "not_requested")) + return "not_requested" + + +def matches_topic_roots(word: str, topic: str) -> bool: + roots = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(topic, ()) + return not roots or any(root in word for root in roots) + + +def matches_safe_babelnet_roots(word: str, topic: str) -> bool: + prefixes = BABELNET_TOPIC_SAFE_PREFIXES.get(topic) + if prefixes is None: + return False + return any(word.startswith(prefix) for prefix in prefixes) + + +def is_blocked_for_topic(word: str, topic: str) -> bool: + return any(part in word for part in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(topic, ())) + + +def topic_score(entry: Dict[str, object], topic: str) -> int: + if topic == DEFAULT_TOPIC: + return 20 + + word = str(entry.get("form", "")).lower() + topics, semantic_topics = entry_topics(entry) + score = 0 + if topic in topics: + score += 100 + if topic in semantic_topics: + score += 45 + if matches_topic_roots(word, topic): + score += 35 + if DEFAULT_TOPIC in topics: + score += 5 + if is_blocked_for_topic(word, topic): + score -= 100 + if topic in CONCRETE_TOPICS and word.endswith(ABSTRACTISH_SUFFIXES): + score -= 30 + return score + + +def candidate_score(entry: Dict[str, object], topic: str) -> Tuple[int, int, int, int, int, str]: + word = str(entry.get("form", "")) + pos = str(entry.get("pos", "")) + pos_bonus = { + "NOUN": 12, + "VERB": 8, + "ADJ": 6, + "ADV": 4, + }.get(pos, 0) + semantic = entry.get("semantic", {}) + semantic_bonus = 3 if isinstance(semantic, dict) and 
semantic.get("matched") else 0 + length_bonus = 4 if 4 <= len(word) <= 10 else 1 if len(word) <= 14 else -3 + return ( + topic_score(entry, topic), + int(entry.get("quality_score", 0)), + pos_bonus, + semantic_bonus, + length_bonus, + word, + ) + + +def eligible_for_babelnet(entry: Dict[str, object], topic: str, difficulty_level: int, retry_no_match: bool) -> bool: + word = str(entry.get("form", "")).lower() + pos = str(entry.get("pos", "")) + topics, semantic_topics = entry_topics(entry) + status = current_babelnet_status(entry) + allowed_statuses = set(ENRICHABLE_STATUSES) + if retry_no_match: + allowed_statuses.add("no_match") + + if status not in allowed_statuses: + return False + if not word.isalpha() or len(word) < 3 or len(word) > 16: + return False + if pos not in POS_TO_BABELNET or pos not in FILL_ALLOWED_POS: + return False + if int(entry.get("difficulty_word", 5)) > difficulty_level: + return False + if not entry.get("allowed_in_crossword", False): + return False + if topic != DEFAULT_TOPIC: + if topic in CONCRETE_TOPICS and word.endswith(ABSTRACTISH_SUFFIXES): + return False + conservative_match = topic in topics + safe_root_match = matches_safe_babelnet_roots(word, topic) + semantic_only_match = topic in semantic_topics and topic not in CONCRETE_TOPICS + if not (conservative_match or safe_root_match or semantic_only_match): + return False + return True + + +def select_candidates(payload: Dict[str, object], topic: str, difficulty_level: int, limit: int, retry_no_match: bool) -> List[Dict[str, object]]: + entries = [ + entry + for entry in payload.get("entries", []) or [] + if isinstance(entry, dict) and eligible_for_babelnet(entry, topic, difficulty_level, retry_no_match) + ] + + if topic != DEFAULT_TOPIC: + strong = [entry for entry in entries if topic in entry_topics(entry)[0]] + soft = [ + entry + for entry in entries + if entry not in strong + and int(entry.get("quality_score", 0)) >= GENERAL_FILL_MIN_QUALITY + and len(str(entry.get("form", ""))) <= 
GENERAL_FILL_MAX_LENGTH + ] + support = [ + entry + for entry in entries + if entry not in strong + and entry not in soft + and int(entry.get("quality_score", 0)) >= GENERAL_FILL_MIN_QUALITY + and not str(entry.get("form", "")).endswith(ABSTRACTISH_SUFFIXES) + ] + entries = strong + sorted(soft, key=lambda item: candidate_score(item, topic), reverse=True)[:SOFT_RELATED_FILL_LIMIT] + entries += sorted(support, key=lambda item: candidate_score(item, topic), reverse=True) + + entries = dedupe(entries) + entries.sort(key=lambda item: candidate_score(item, topic), reverse=True) + return entries[:limit] + + +def select_word_candidates( + payload: Dict[str, object], + words: Iterable[str], + limit: int, + retry_no_match: bool, +) -> List[Dict[str, object]]: + requested = [] + seen_words = set() + for word in words: + normalized = str(word).strip().lower() + if normalized and normalized not in seen_words: + requested.append(normalized) + seen_words.add(normalized) + + by_word = { + str(entry.get("form", "")).lower(): entry + for entry in payload.get("entries", []) or [] + if isinstance(entry, dict) + } + selected = [] + allowed_statuses = set(ENRICHABLE_STATUSES) + if retry_no_match: + allowed_statuses.add("no_match") + + for word in requested: + entry = by_word.get(word) + if not entry: + continue + status = current_babelnet_status(entry) + if status not in allowed_statuses: + continue + if str(entry.get("pos", "")) not in POS_TO_BABELNET: + continue + if not str(entry.get("form", "")).isalpha(): + continue + selected.append(entry) + if len(selected) >= limit: + break + + return selected + + +def load_source_payload(enriched_path: Path, semantic_path: Path) -> Dict[str, object]: + if enriched_path.exists(): + payload = load_json(enriched_path, {}) + if isinstance(payload, dict) and "entries" in payload: + return payload + payload = load_json(semantic_path, {}) + if isinstance(payload, dict) and "entries" in payload: + return payload + raise ValueError(f"Nessun lessico 
valido trovato: {enriched_path} / {semantic_path}") + + +def merge_babelnet_entries(existing_payload: Dict[str, object], new_entries: List[Dict[str, object]], topic: str, difficulty: str) -> Dict[str, object]: + existing_entries = [ + entry for entry in existing_payload.get("entries", []) or [] if isinstance(entry, dict) + ] + index = {entry_key(entry): deepcopy(entry) for entry in existing_entries} + generated_at = datetime.now().astimezone().isoformat(timespec="seconds") + + for entry in new_entries: + updated = deepcopy(entry) + updated["babelnet_generated_at"] = generated_at + index[entry_key(updated)] = updated + + entries = sorted(index.values(), key=lambda item: (str(item.get("form", "")), str(item.get("pos", "")))) + meta = dict(existing_payload.get("meta", {})) if isinstance(existing_payload.get("meta", {}), dict) else {} + meta.update( + { + "language": meta.get("language", "it"), + "version": max(1, int(meta.get("version", 1))), + "source": "BabelNet API", + "updated_at": generated_at, + "last_topic": topic, + "last_difficulty": difficulty, + "entry_count": len(entries), + } + ) + return {"meta": meta, "entries": entries} + + +def rebuild_enriched(semantic_path: Path, babelnet_path: Path, enriched_path: Path, topic: str) -> Dict[str, object]: + namespace = SimpleNamespace( + semantic=semantic_path, + babelnet=babelnet_path, + output=enriched_path, + topic=topic, + ) + payload = build_enriched_lexicon(namespace) + write_enriched_json(enriched_path, payload) + return payload + + +def run_incremental_enrichment(args: argparse.Namespace) -> Dict[str, object]: + normalized_topic = args.topic.strip().lower() + difficulty_level = parse_difficulty(str(args.difficulty)) + source_payload = load_source_payload(args.enriched, args.semantic) + target_words = getattr(args, "words", None) + if target_words: + candidates = select_word_candidates( + source_payload, + target_words, + max(0, args.limit), + args.retry_no_match, + ) + else: + candidates = select_candidates( 
+ source_payload, + normalized_topic, + difficulty_level, + max(0, args.limit), + args.retry_no_match, + ) + + if args.dry_run: + return { + "mode": "dry-run", + "topic": normalized_topic, + "difficulty": args.difficulty, + "selected_count": len(candidates), + "selected_words": [entry.get("form") for entry in candidates], + } + + if not args.api_key: + raise SystemExit( + f"Chiave BabelNet mancante. Imposta {BABELNET_ENV_KEY} oppure usa --api-key ." + ) + + cache = load_json(BABELNET_CACHE_PATH, {}) + if not isinstance(cache, dict): + cache = {} + babelnet_payload = load_json(args.babelnet, {"entries": []}) + if not isinstance(babelnet_payload, dict): + babelnet_payload = {"entries": []} + + enriched_candidates = [] + word_logs = [] + for index, entry in enumerate(candidates, start=1): + updated = deepcopy(entry) + updated.pop("babelnet", None) + stats = {"api_calls": 0, "cache_hits": 0, "responses": 0} + updated["babelnet"] = enrich_entry(updated, args.api_key, cache, args.sleep, stats) + enriched_candidates.append(updated) + write_json(BABELNET_CACHE_PATH, cache) + word_logs.append( + { + "word": updated["form"], + "api_calls": stats["api_calls"], + "cache_hits": stats["cache_hits"], + "responses": stats["responses"], + "matched": bool(updated["babelnet"].get("matched")), + "synsets": len(updated["babelnet"].get("synsets", []) or []), + "reason": updated["babelnet"].get("reason"), + } + ) + print( + f"[{index}/{len(candidates)}] {updated['form']}: " + f"api_calls={stats['api_calls']} cache_hits={stats['cache_hits']} " + f"risposta={stats['responses'] > 0} match={updated['babelnet'].get('matched')}" + ) + + merged_babelnet = merge_babelnet_entries( + babelnet_payload, + enriched_candidates, + normalized_topic, + str(args.difficulty), + ) + write_json(args.babelnet, merged_babelnet) + enriched_payload = rebuild_enriched(args.semantic, args.babelnet, args.enriched, normalized_topic) + + return { + "mode": "enriched", + "topic": normalized_topic, + "difficulty": 
args.difficulty, + "selected_count": len(candidates), + "matched_count": sum(1 for entry in enriched_candidates if entry.get("babelnet", {}).get("matched")), + "api_call_count": sum(item["api_calls"] for item in word_logs), + "cache_hit_count": sum(item["cache_hits"] for item in word_logs), + "word_logs": word_logs, + "babelnet_entry_count": merged_babelnet["meta"]["entry_count"], + "enriched_status_counts": enriched_payload["meta"]["babelnet_status_counts"], + } + + +def main() -> None: + args = parse_args() + result = run_incremental_enrichment(args) + if result["mode"] == "dry-run": + print("Dry-run BabelNet incrementale") + print(f"Topic: {result['topic']}") + print(f"Difficolta: {result['difficulty']}") + print(f"Parole selezionate: {result['selected_count']}") + for index, word in enumerate(result["selected_words"], start=1): + print(f"{index:2d}. {word}") + return + + print("Arricchimento BabelNet completato") + print(f"Topic: {result['topic']}") + print(f"Parole interrogate: {result['selected_count']}") + print(f"Chiamate API BabelNet reali: {result['api_call_count']}") + print(f"Risposte da cache: {result['cache_hit_count']}") + print(f"Match BabelNet: {result['matched_count']}") + for item in result["word_logs"]: + print( + f"- {item['word']}: api_calls={item['api_calls']}, " + f"cache_hits={item['cache_hits']}, risposta={item['responses'] > 0}, " + f"match={item['matched']}, synsets={item['synsets']}" + ) + print(f"Voci BabelNet archiviate: {result['babelnet_entry_count']}") + print(f"Stati lessico arricchito: {result['enriched_status_counts']}") + + +if __name__ == "__main__": + main() diff --git a/build_babelnet_enrichment.py b/build_babelnet_enrichment.py index cc45a78..f93014a 100644 --- a/build_babelnet_enrichment.py +++ b/build_babelnet_enrichment.py @@ -12,11 +12,11 @@ from pathlib import Path from typing import Dict, Iterable, List, Optional from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH -from main import parse_difficulty 
BABELNET_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_babelnet.json") BABELNET_CACHE_PATH = Path(__file__).with_name(".babelnet_cache.json") +BABELNET_LOCAL_KEY_PATH = Path(__file__).with_name(".babelnet_api_key.local") BABELNET_API_BASE = "https://babelnet.io/v9" BABELNET_ENV_KEY = "BABELNET_API_KEY" @@ -28,14 +28,76 @@ POS_TO_BABELNET = { } +class BabelNetApiCallLimitReached(RuntimeError): + pass + + +class BabelNetKeyUnavailable(RuntimeError): + pass + +DIFFICULTY_ALIASES: Dict[str, int] = { + "easy": 1, + "medium": 2, + "hard": 4, + "expert": 5, +} + + +def parse_difficulty(value: str) -> int: + text = str(value).strip().lower() + if text in DIFFICULTY_ALIASES: + return DIFFICULTY_ALIASES[text] + try: + level = int(text) + except ValueError as exc: + raise SystemExit( + "Valore non valido per --difficulty. Usa easy, medium, hard, expert oppure un intero tra 1 e 5." + ) from exc + if not 1 <= level <= 5: + raise SystemExit("Il valore numerico di --difficulty deve essere compreso tra 1 e 5.") + return level + + +def _split_api_keys(text: str) -> List[str]: + keys = [] + seen = set() + normalized = text.replace(";", "\n").replace(",", "\n") + for line in normalized.splitlines(): + key = line.strip() + if not key or key.startswith("#") or key in seen: + continue + keys.append(key) + seen.add(key) + return keys + + +def load_babelnet_api_keys() -> List[str]: + env_key = os.environ.get(BABELNET_ENV_KEY) + if env_key: + return _split_api_keys(env_key) + if BABELNET_LOCAL_KEY_PATH.exists(): + return _split_api_keys(BABELNET_LOCAL_KEY_PATH.read_text(encoding="utf-8")) + return [] + + +def load_babelnet_api_key() -> Optional[str]: + keys = load_babelnet_api_keys() + if keys: + return keys[0] + return None + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Arricchisce lexicon_it_semantic.json usando BabelNet, se disponibile una API key." 
) parser.add_argument( "--api-key", - default=os.environ.get(BABELNET_ENV_KEY), - help=f"Chiave API BabelNet. In alternativa imposta la variabile ambiente {BABELNET_ENV_KEY}.", + default=load_babelnet_api_key(), + help=( + f"Chiave API BabelNet. In alternativa imposta {BABELNET_ENV_KEY} " + f"o crea {BABELNET_LOCAL_KEY_PATH.name}." + ), ) parser.add_argument( "--topic", @@ -78,10 +140,29 @@ def write_json(path: Path, payload: object) -> None: path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") -def request_json(endpoint: str, params: Dict[str, str], cache: Dict[str, object]) -> object: +def cache_key(endpoint: str, params: Dict[str, str]) -> str: + safe_params = {key: value for key, value in params.items() if key != "key"} + return f"{endpoint}?{urllib.parse.urlencode(sorted(safe_params.items()))}" + + +def request_json( + endpoint: str, + params: Dict[str, str], + cache: Dict[str, object], + stats: Optional[Dict[str, int]] = None, +) -> object: url = f"{BABELNET_API_BASE}/{endpoint}?{urllib.parse.urlencode(params)}" - if url in cache: - return cache[url] + key = cache_key(endpoint, params) + if key in cache: + if stats is not None: + stats["cache_hits"] = stats.get("cache_hits", 0) + 1 + return cache[key] + + if stats is not None: + limit = stats.get("api_call_limit") + current = stats.get("api_calls", 0) + if limit is not None and current >= limit: + raise BabelNetApiCallLimitReached("Limite chiamate API BabelNet raggiunto") request = urllib.request.Request(url, headers={"Accept": "application/json"}) try: @@ -89,9 +170,14 @@ def request_json(endpoint: str, params: Dict[str, str], cache: Dict[str, object] payload = json.loads(response.read().decode("utf-8")) except urllib.error.HTTPError as exc: detail = exc.read().decode("utf-8", errors="replace") + if exc.code == 403: + raise BabelNetKeyUnavailable(f"Chiave BabelNet non valida o limite giornaliero raggiunto: {detail}") from exc raise RuntimeError(f"Errore BabelNet HTTP 
{exc.code}: {detail}") from exc - cache[url] = payload + cache[key] = payload + if stats is not None: + stats["api_calls"] = stats.get("api_calls", 0) + 1 + stats["responses"] = stats.get("responses", 0) + 1 return payload @@ -180,7 +266,13 @@ def dedupe(items: Iterable[str]) -> List[str]: return result -def enrich_entry(entry: Dict[str, object], api_key: str, cache: Dict[str, object], sleep_seconds: float) -> Dict[str, object]: +def enrich_entry( + entry: Dict[str, object], + api_key: str, + cache: Dict[str, object], + sleep_seconds: float, + stats: Optional[Dict[str, int]] = None, +) -> Dict[str, object]: word = str(entry.get("form", "")) pos = POS_TO_BABELNET.get(str(entry.get("pos", ""))) if not pos: @@ -195,6 +287,7 @@ def enrich_entry(entry: Dict[str, object], api_key: str, cache: Dict[str, object "key": api_key, }, cache, + stats, ) if sleep_seconds: time.sleep(sleep_seconds) @@ -215,6 +308,7 @@ def enrich_entry(entry: Dict[str, object], api_key: str, cache: Dict[str, object "key": api_key, }, cache, + stats, ) if sleep_seconds: time.sleep(sleep_seconds) diff --git a/build_enriched_lexicon.py b/build_enriched_lexicon.py new file mode 100644 index 0000000..3de1ddb --- /dev/null +++ b/build_enriched_lexicon.py @@ -0,0 +1,324 @@ +from __future__ import annotations + +import argparse +import json +from copy import deepcopy +from datetime import datetime +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Tuple + +from build_babelnet_enrichment import BABELNET_OUTPUT_PATH +from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH + + +ENRICHED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_enriched.json") + +TOPIC_DOMAIN_RULES: Dict[str, Dict[str, Tuple[str, ...]]] = { + "transport": { + "strong": ( + "TRANSPORT_AND_TRAVEL", + "NAVIGATION_AND_AVIATION", + ), + "weak": ( + "CRAFT_ENGINEERING_AND_TECHNOLOGY", + "FARMING_FISHING_AND_HUNTING", + ), + "negative": ( + "MEDIA_AND_PRESS", + "PHILOSOPHY_PSYCHOLOGY_AND_BEHAVIOR", 
+ "RELIGION_MYSTICISM_AND_MYTHOLOGY", + "CHEMISTRY_AND_MINERALOGY", + ), + }, + "health": { + "strong": ("HEALTH_AND_MEDICINE",), + "weak": ("BIOLOGY",), + "negative": ("MEDIA_AND_PRESS",), + }, + "cinema": { + "strong": ("MEDIA_AND_PRESS",), + "weak": ("ART_ARCHITECTURE_AND_ARCHAEOLOGY",), + "negative": ("HEALTH_AND_MEDICINE", "CHEMISTRY_AND_MINERALOGY"), + }, + "nature": { + "strong": ( + "BIOLOGY", + "ANIMALS", + "PLANTS", + "EARTH", + "METEOROLOGY", + ), + "weak": ("GEOGRAPHY_AND_PLACES",), + "negative": ("MEDIA_AND_PRESS",), + }, + "ecology": { + "strong": ("BIOLOGY", "EARTH", "METEOROLOGY"), + "weak": ("GEOGRAPHY_AND_PLACES",), + "negative": ("MEDIA_AND_PRESS",), + }, +} + +TOPIC_TEXT_KEYWORDS: Dict[str, Tuple[str, ...]] = { + "transport": ( + "aereo", + "auto", + "autobus", + "barca", + "bicicletta", + "imbarcazione", + "motore", + "nave", + "pista", + "trasport", + "treno", + "veicolo", + "viaggio", + ), + "health": ("cura", "malato", "medic", "ospedale", "paziente", "salute", "soccorso"), + "cinema": ("attore", "cinema", "film", "pellicola", "regia", "spettacolo"), + "nature": ("acqua", "animale", "bosco", "fiore", "mare", "montagna", "pianta", "terra"), + "ecology": ("ambiente", "ecologia", "inquinamento", "natura", "sostenibile"), +} + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Fonde lexicon_it_semantic.json con gli arricchimenti BabelNet gia disponibili." 
+ ) + parser.add_argument( + "--semantic", + type=Path, + default=SEMANTIC_LEXICON_OUTPUT_PATH, + help="Lessico semantico completo di partenza.", + ) + parser.add_argument( + "--babelnet", + type=Path, + default=BABELNET_OUTPUT_PATH, + help="File con arricchimenti BabelNet parziali.", + ) + parser.add_argument( + "--output", + type=Path, + default=ENRICHED_LEXICON_OUTPUT_PATH, + help="Lessico arricchito da generare.", + ) + parser.add_argument( + "--topic", + default=None, + help="Topic opzionale da usare per scegliere il synset BabelNet migliore.", + ) + return parser.parse_args() + + +def load_json(path: Path, default: object) -> object: + if not path.exists(): + return default + return json.loads(path.read_text(encoding="utf-8")) + + +def write_json(path: Path, payload: object) -> None: + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + +def entry_key(entry: Dict[str, object]) -> Tuple[str, str]: + form = str(entry.get("normalized_form") or entry.get("form") or "").strip().lower() + pos = str(entry.get("pos") or "").strip().upper() + return form, pos + + +def dedupe(items: Iterable[str]) -> List[str]: + result = [] + seen = set() + for item in items: + text = str(item).strip() + if not text or text in seen: + continue + seen.add(text) + result.append(text) + return result + + +def topic_candidates(entry: Dict[str, object], requested_topic: Optional[str]) -> List[str]: + topics = [str(topic).lower() for topic in entry.get("topics", []) if topic] + if requested_topic: + topics.insert(0, requested_topic.lower()) + return [topic for topic in dedupe(topics) if topic != "general"] + + +def synset_text(synset: Dict[str, object]) -> str: + fields = [] + fields.extend(str(item) for item in synset.get("glosses", []) or []) + fields.extend(str(item) for item in synset.get("categories", []) or []) + fields.extend(str(item) for item in synset.get("senses", []) or []) + return " ".join(fields).lower() + + +def 
score_synset_for_topic(synset: Dict[str, object], topic: str) -> int: + score = 0 + domains = {str(domain).upper() for domain in synset.get("domains", []) or []} + rules = TOPIC_DOMAIN_RULES.get(topic, {}) + + score += 60 * len(domains.intersection(rules.get("strong", ()))) + score += 25 * len(domains.intersection(rules.get("weak", ()))) + score -= 35 * len(domains.intersection(rules.get("negative", ()))) + + text = synset_text(synset) + for keyword in TOPIC_TEXT_KEYWORDS.get(topic, ()): + if keyword in text: + score += 12 + + return score + + +def choose_best_synset( + babelnet: Dict[str, object], entry: Dict[str, object], requested_topic: Optional[str] +) -> Tuple[Optional[Dict[str, object]], Dict[str, int]]: + synsets = [item for item in babelnet.get("synsets", []) or [] if isinstance(item, dict)] + topics = topic_candidates(entry, requested_topic) + if not synsets: + return None, {} + + if not topics: + best_synset = synsets[0] + return { + "id": best_synset.get("id"), + "topic": None, + "topic_score": 0, + "strong_topic": False, + "senses": best_synset.get("senses", []), + "glosses": best_synset.get("glosses", []), + "categories": best_synset.get("categories", []), + "domains": best_synset.get("domains", []), + }, {} + + topic_scores: Dict[str, int] = {} + best_synset = None + best_topic = None + best_score = -10_000 + + for topic in topics: + topic_best = max(score_synset_for_topic(synset, topic) for synset in synsets) + topic_scores[topic] = topic_best + for synset in synsets: + score = score_synset_for_topic(synset, topic) + if score > best_score: + best_score = score + best_topic = topic + best_synset = synset + + if not best_synset: + return None, topic_scores + + return { + "id": best_synset.get("id"), + "topic": best_topic, + "topic_score": best_score, + "strong_topic": best_score >= 40, + "senses": best_synset.get("senses", []), + "glosses": best_synset.get("glosses", []), + "categories": best_synset.get("categories", []), + "domains": 
best_synset.get("domains", []), + }, topic_scores + + +def normalize_babelnet_status( + entry: Dict[str, object], babelnet_entry: Optional[Dict[str, object]], requested_topic: Optional[str] +) -> Dict[str, object]: + if not babelnet_entry: + return {"status": "not_requested"} + + raw_babelnet = babelnet_entry.get("babelnet", {}) + if not isinstance(raw_babelnet, dict): + return {"status": "api_error", "reason": "invalid_babelnet_payload"} + + if not raw_babelnet.get("matched"): + return { + "status": "no_match", + "matched": False, + "reason": raw_babelnet.get("reason", "no_synsets"), + "synsets": [], + } + + best_synset, topic_scores = choose_best_synset(raw_babelnet, entry, requested_topic) + status = "enriched" + if best_synset and int(best_synset.get("topic_score", 0)) <= 0: + status = "ambiguous" + selected_synset_id = best_synset.get("id") if best_synset else None + selected_topic = best_synset.get("topic") if best_synset else None + topic_score = int(best_synset.get("topic_score", 0)) if best_synset else 0 + strong_topic = bool(best_synset.get("strong_topic", False)) if best_synset else False + + return { + "status": status, + "matched": True, + "selected_synset_id": selected_synset_id, + "selected_topic": selected_topic, + "topic_score": topic_score, + "strong_topic": strong_topic, + "synset_refs": raw_babelnet.get("synset_refs", []), + "synsets": raw_babelnet.get("synsets", []), + "topic_scores": topic_scores, + "best_synset": best_synset, + "source_generated_at": babelnet_entry.get("babelnet_generated_at"), + } + + +def build_babelnet_index(payload: Dict[str, object]) -> Dict[Tuple[str, str], Dict[str, object]]: + index = {} + for entry in payload.get("entries", []) or []: + if not isinstance(entry, dict): + continue + index[entry_key(entry)] = entry + return index + + +def build_enriched_lexicon(args: argparse.Namespace) -> Dict[str, object]: + semantic_payload = load_json(args.semantic, {}) + if not isinstance(semantic_payload, dict) or "entries" not in 
semantic_payload: + raise ValueError(f"Lessico semantico non valido: {args.semantic}") + + babelnet_payload = load_json(args.babelnet, {"entries": []}) + if not isinstance(babelnet_payload, dict): + babelnet_payload = {"entries": []} + + babelnet_index = build_babelnet_index(babelnet_payload) + enriched_entries = [] + status_counts: Dict[str, int] = {} + + for entry in semantic_payload.get("entries", []) or []: + if not isinstance(entry, dict): + continue + enriched = deepcopy(entry) + babelnet_entry = babelnet_index.get(entry_key(enriched)) + enriched["babelnet"] = normalize_babelnet_status(enriched, babelnet_entry, args.topic) + status = str(enriched["babelnet"].get("status", "unknown")) + status_counts[status] = status_counts.get(status, 0) + 1 + enriched_entries.append(enriched) + + return { + "meta": { + "language": semantic_payload.get("meta", {}).get("language", "it"), + "version": 1, + "base_lexicon": args.semantic.name, + "babelnet_source": args.babelnet.name if args.babelnet.exists() else None, + "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"), + "requested_topic": args.topic, + "entry_count": len(enriched_entries), + "babelnet_status_counts": status_counts, + }, + "entries": enriched_entries, + } + + +def main() -> None: + args = parse_args() + payload = build_enriched_lexicon(args) + write_json(args.output, payload) + print(f"Lessico arricchito generato: {args.output}") + print(f"Voci totali: {payload['meta']['entry_count']}") + print(f"Stati BabelNet: {payload['meta']['babelnet_status_counts']}") + + +if __name__ == "__main__": + main() diff --git a/build_llm_rescue_patch.py b/build_llm_rescue_patch.py new file mode 100644 index 0000000..28206b4 --- /dev/null +++ b/build_llm_rescue_patch.py @@ -0,0 +1,429 @@ +from __future__ import annotations + +import argparse +import json +import os +import time +import urllib.error +import urllib.request +from datetime import datetime +from pathlib import Path +from typing import Any, 
Dict, List, Optional + + +PRIORITY_INPUT_PATH = Path(__file__).with_name("to_be_review_priority.json") +PATCH_OUTPUT_PATH = Path(__file__).with_name("llm_rescue_patch.json") + + +SYSTEM_PROMPT = """Sei un lessicografo italiano che prepara definizioni sintetiche per cruciverba. +Ricevi un lemma con parte del discorso e contesto semantico parziale. +Devi proporre una definizione breve in italiano, topic plausibili e tag semantici. + +Regole: +- Rispondi solo con JSON valido. +- La definizione deve essere concisa, naturale e utile per un cruciverba. +- Evita di includere il lemma o derivati ovvi del lemma nella definizione. +- Se il termine sembra raro, ambiguo, refuso o poco affidabile, abbassa la confidenza e segnala needs_human_review=true. +- I topic devono essere pochi, in inglese semplice minuscolo con underscore se serve. +- I semantic_tags devono essere pochi, descrittivi e in italiano o inglese semplice. +- Non inventare dettagli enciclopedici troppo specifici se non supportati dal contesto. + +Formato JSON obbligatorio: +{ + "definition": "...", + "topics": ["topic1", "topic2"], + "semantic_tags": ["tag1", "tag2"], + "confidence": 0.0, + "needs_human_review": true, + "notes": "..." +} +""" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Costruisce una patch di rescue lessicale usando un LLM su un lotto di voci " + "prioritarie tratte da to_be_review_priority.json." + ) + ) + parser.add_argument( + "--input", + type=Path, + default=PRIORITY_INPUT_PATH, + help="File to_be_review_priority.json di partenza.", + ) + parser.add_argument( + "--output", + type=Path, + default=PATCH_OUTPUT_PATH, + help="Patch JSON da generare o aggiornare.", + ) + parser.add_argument( + "--limit", + type=int, + default=50, + help="Numero massimo di voci da processare nel lotto. 
Usa 0 per tutte le voci selezionate.", + ) + parser.add_argument( + "--bucket", + default="red", + help="Bucket di priorita da considerare: red, orange, yellow oppure all.", + ) + parser.add_argument( + "--provider", + choices=("openai_compatible", "ollama"), + default="openai_compatible", + help="Tipo di endpoint LLM da usare.", + ) + parser.add_argument( + "--api-base", + default="", + help=( + "Endpoint API. Per openai_compatible: .../v1/chat/completions. " + "Per ollama: .../api/chat." + ), + ) + parser.add_argument( + "--api-key-env", + default="OPENAI_API_KEY", + help="Nome della variabile d'ambiente che contiene la API key.", + ) + parser.add_argument( + "--model", + default="gpt-4.1-mini", + help="Nome del modello da interrogare.", + ) + parser.add_argument( + "--temperature", + type=float, + default=0.2, + help="Temperatura della richiesta LLM.", + ) + parser.add_argument( + "--sleep", + type=float, + default=0.5, + help="Pausa tra una richiesta e la successiva.", + ) + parser.add_argument( + "--skip-existing", + action="store_true", + help="Salta le voci gia presenti nell'output con status drafted/reviewed/done.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Non chiama alcun LLM: prepara solo il lotto e marca le voci come selected.", + ) + return parser.parse_args() + + +def load_json(path: Path, default: object) -> object: + if not path.exists(): + return default + return json.loads(path.read_text(encoding="utf-8")) + + +def write_json(path: Path, payload: object) -> None: + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + +def build_record(entry: Dict[str, Any]) -> Dict[str, Any]: + wiktextract = entry.get("wiktextract") or {} + wiktextract_defs = wiktextract.get("definitions") if isinstance(wiktextract, dict) else [] + babelnet_best = entry.get("babelnet_best_synset") or {} + babelnet_glosses = babelnet_best.get("glosses") if isinstance(babelnet_best, dict) else [] + return { + "form": 
entry.get("form"), + "lemma": entry.get("lemma"), + "pos": entry.get("pos"), + "priority_bucket": entry.get("priority_bucket"), + "priority_score": entry.get("priority_score"), + "review_reasons": entry.get("review_reasons", []), + "current_topics": entry.get("topics", []), + "current_definition": entry.get("preferred_definition", ""), + "current_source": entry.get("preferred_source", ""), + "context": { + "topic_suggestions": entry.get("topic_suggestions", []), + "semantic_glosses": entry.get("semantic_glosses", []), + "senses": entry.get("senses", []), + "wiktextract_definitions": wiktextract_defs or [], + "wiktextract_topic_hints": wiktextract.get("topic_hints", []) if isinstance(wiktextract, dict) else [], + "babelnet_glosses": babelnet_glosses or [], + }, + "rescue_definition": "", + "rescue_source": "", + "rescue_topics": [], + "rescue_semantic_tags": [], + "rescue_notes": "", + "confidence": 0.0, + "needs_human_review": True, + "status": "pending", + } + + +def build_user_prompt(entry: Dict[str, Any]) -> str: + context = entry.get("context") or {} + payload = { + "form": entry.get("form"), + "lemma": entry.get("lemma"), + "pos": entry.get("pos"), + "current_topics": entry.get("current_topics", []), + "review_reasons": entry.get("review_reasons", []), + "current_definition": entry.get("current_definition", ""), + "context": context, + } + return ( + "Genera una proposta di rescue lessicale per questa voce italiana.\n" + "Se il termine sembra un refuso o una variante dubbia, segnalalo nelle notes.\n" + "Payload:\n" + f"{json.dumps(payload, ensure_ascii=False, indent=2)}" + ) + + +def resolve_api_base(args: argparse.Namespace) -> str: + if args.api_base: + return args.api_base + if args.provider == "ollama": + return "http://localhost:11434/api/chat" + return "https://api.openai.com/v1/chat/completions" + + +def request_openai_compatible( + api_base: str, + api_key: str, + model: str, + temperature: float, + user_prompt: str, +) -> str: + payload = { + "model": 
model, + "temperature": temperature, + "messages": [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": user_prompt}, + ], + } + request = urllib.request.Request( + api_base, + data=json.dumps(payload).encode("utf-8"), + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}", + }, + method="POST", + ) + try: + with urllib.request.urlopen(request, timeout=90) as response: + body = json.loads(response.read().decode("utf-8")) + except urllib.error.HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + raise RuntimeError(f"OpenAI-compatible HTTP {exc.code}: {detail}") from exc + return str(body["choices"][0]["message"]["content"]).strip() + + +def request_ollama( + api_base: str, + model: str, + temperature: float, + user_prompt: str, +) -> str: + payload = { + "model": model, + "stream": False, + "options": {"temperature": temperature}, + "messages": [ + {"role": "system", "content": SYSTEM_PROMPT}, + {"role": "user", "content": user_prompt}, + ], + } + request = urllib.request.Request( + api_base, + data=json.dumps(payload).encode("utf-8"), + headers={"Content-Type": "application/json"}, + method="POST", + ) + try: + with urllib.request.urlopen(request, timeout=90) as response: + body = json.loads(response.read().decode("utf-8")) + except urllib.error.HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + raise RuntimeError(f"Ollama HTTP {exc.code}: {detail}") from exc + return str((body.get("message") or {}).get("content", "")).strip() + + +def extract_json_object(text: str) -> Dict[str, Any]: + text = text.strip() + start = text.find("{") + end = text.rfind("}") + if start == -1 or end == -1 or end <= start: + raise ValueError("Risposta LLM senza oggetto JSON riconoscibile.") + return json.loads(text[start : end + 1]) + + +def normalize_llm_payload(payload: Dict[str, Any], model: str) -> Dict[str, Any]: + topics = payload.get("topics") + tags = 
payload.get("semantic_tags") + confidence = payload.get("confidence", 0.0) + return { + "rescue_definition": str(payload.get("definition", "")).strip(), + "rescue_source": f"llm_rescue:{model}", + "rescue_topics": [str(item).strip().lower() for item in (topics or []) if str(item).strip()], + "rescue_semantic_tags": [str(item).strip() for item in (tags or []) if str(item).strip()], + "rescue_notes": str(payload.get("notes", "")).strip(), + "confidence": max(0.0, min(1.0, float(confidence or 0.0))), + "needs_human_review": bool(payload.get("needs_human_review", True)), + "status": "drafted", + } + + +def should_skip_existing(entry: Dict[str, Any]) -> bool: + return str(entry.get("status", "")).lower() in {"drafted", "reviewed", "done"} + + +def generate_patch(args: argparse.Namespace) -> Dict[str, Any]: + source_payload = load_json(args.input, {"entries": []}) + if not isinstance(source_payload, dict): + raise ValueError(f"File priority non valido: {args.input}") + + output_payload = load_json(args.output, {"entries": []}) + if not isinstance(output_payload, dict): + output_payload = {"entries": []} + + existing_by_form = { + str(entry.get("form", "")).lower(): entry + for entry in output_payload.get("entries", []) or [] + if isinstance(entry, dict) and entry.get("form") + } + + bucket = str(args.bucket or "red").strip().lower() + source_entries = source_payload.get("practical_entries") or source_payload.get("entries") or [] + + max_items = int(args.limit) + unlimited = max_items <= 0 + selected: List[Dict[str, Any]] = [] + skipped_preselection = 0 + for entry in source_entries: + if not isinstance(entry, dict): + continue + if bucket != "all" and str(entry.get("priority_bucket", "")).lower() != bucket: + continue + form = str(entry.get("form", "")).strip().lower() + if not form: + continue + existing = existing_by_form.get(form) + if args.skip_existing and existing and should_skip_existing(existing): + skipped_preselection += 1 + continue + selected.append(entry) + 
if not unlimited and len(selected) >= max(1, max_items): + break + + api_base = resolve_api_base(args) + api_key = os.environ.get(args.api_key_env, "") if args.provider == "openai_compatible" else "" + if not args.dry_run and args.provider == "openai_compatible" and not api_key: + raise RuntimeError( + f"Variabile d'ambiente {args.api_key_env} non valorizzata per provider openai_compatible." + ) + + merged_records: List[Dict[str, Any]] = [] + processed = 0 + skipped_existing = 0 + for source_entry in selected: + form_key = str(source_entry.get("form", "")).strip().lower() + existing = existing_by_form.get(form_key) + record = dict(existing) if isinstance(existing, dict) else build_record(source_entry) + + if args.skip_existing and existing and should_skip_existing(existing): + skipped_existing += 1 + merged_records.append(record) + continue + + if args.dry_run: + record["status"] = "selected" + record["rescue_source"] = f"llm_rescue:{args.model}" + merged_records.append(record) + processed += 1 + continue + + user_prompt = build_user_prompt(record) + try: + if args.provider == "ollama": + raw_text = request_ollama(api_base, args.model, args.temperature, user_prompt) + else: + raw_text = request_openai_compatible( + api_base, + api_key, + args.model, + args.temperature, + user_prompt, + ) + llm_payload = extract_json_object(raw_text) + record.update(normalize_llm_payload(llm_payload, args.model)) + except (urllib.error.URLError, TimeoutError, ValueError, json.JSONDecodeError, RuntimeError) as exc: + record["rescue_source"] = f"llm_rescue:{args.model}" + record["rescue_notes"] = f"errore_llm: {exc}" + record["status"] = "error" + record["needs_human_review"] = True + merged_records.append(record) + processed += 1 + print( + f"[{processed}/{len(selected)}] {record.get('form')}: " + f"status={record.get('status')} conf={record.get('confidence', 0.0)}" + ) + if record.get("status") == "error" and record.get("rescue_notes"): + print(f" dettaglio: 
{record.get('rescue_notes')}") + if args.sleep > 0: + time.sleep(args.sleep) + + seen_forms = {str(item.get("form", "")).lower() for item in merged_records} + for form_key, existing in existing_by_form.items(): + if form_key not in seen_forms: + merged_records.append(existing) + + merged_records.sort( + key=lambda item: ( + {"pending": 0, "selected": 1, "error": 2, "drafted": 3, "reviewed": 4, "done": 5}.get( + str(item.get("status", "pending")), + 9, + ), + -int(item.get("priority_score", 0) or 0), + str(item.get("form", "")), + ) + ) + + return { + "meta": { + "language": "it", + "version": 1, + "base_priority": args.input.name, + "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"), + "batch_bucket": bucket, + "batch_limit": int(args.limit), + "provider": args.provider, + "api_base": api_base, + "model": args.model, + "dry_run": bool(args.dry_run), + "entry_count": len(merged_records), + "processed_count": processed, + "skipped_existing": skipped_existing, + "skipped_preselection": skipped_preselection, + }, + "entries": merged_records, + } + + +def main() -> None: + args = parse_args() + payload = generate_patch(args) + write_json(args.output, payload) + print(f"Patch LLM rescue generata: {args.output}") + print(f"Voci nel file: {payload['meta']['entry_count']}") + print(f"Voci processate in questo run: {payload['meta']['processed_count']}") + print(f"Voci saltate per skip-existing: {payload['meta']['skipped_existing']}") + print(f"Voci escluse gia in pre-selezione: {payload['meta']['skipped_preselection']}") + + +if __name__ == "__main__": + main() diff --git a/build_review_priority.py b/build_review_priority.py new file mode 100644 index 0000000..713d27c --- /dev/null +++ b/build_review_priority.py @@ -0,0 +1,182 @@ +from __future__ import annotations + +import argparse +import json +from collections import Counter +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Tuple + + +REVIEW_INPUT_PATH = 
Path(__file__).with_name("to_be_review.json") +PRIORITY_OUTPUT_PATH = Path(__file__).with_name("to_be_review_priority.json") + +REASON_WEIGHTS = { + "no_viable_definition": 100, + "proper_noun_collision": 90, + "candidate_mentions_answer": 85, + "function_word": 80, + "very_short_word": 75, + "wiktextract_missing": 55, + "only_general_topics": 45, + "flagged_by_refined_stage": 35, + "unresolved_sense_topics": 30, + "babelnet_ambiguous": 20, +} + +SOURCE_WEIGHTS = { + "fallback": 50, + "babelnet": 18, + "semantic": 8, + "wiktextract": 6, +} + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Costruisce un file di review prioritizzato partendo da to_be_review.json." + ) + parser.add_argument( + "--input", + type=Path, + default=REVIEW_INPUT_PATH, + help="File to_be_review.json di partenza.", + ) + parser.add_argument( + "--output", + type=Path, + default=PRIORITY_OUTPUT_PATH, + help="File to_be_review_priority.json da generare.", + ) + parser.add_argument( + "--top", + type=int, + default=0, + help="Numero massimo di voci da tenere nel file priority. 
0 = tutte.", + ) + return parser.parse_args() + + +def load_json(path: Path) -> Dict[str, object]: + return json.loads(path.read_text(encoding="utf-8")) + + +def write_json(path: Path, payload: Dict[str, object]) -> None: + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + +def priority_score(entry: Dict[str, object]) -> Tuple[int, int, int, int, str]: + reasons = [str(item) for item in entry.get("review_reasons", []) or []] + source = str(entry.get("preferred_source", "")).lower() + preferred_definition = str(entry.get("preferred_definition", "")) + clue_definitions = entry.get("clue_definitions", {}) or {} + form = str(entry.get("form", "")) + + score = sum(REASON_WEIGHTS.get(reason, 5) for reason in reasons) + score += SOURCE_WEIGHTS.get(source, 0) + + if not preferred_definition: + score += 40 + + clue_count = len([value for value in clue_definitions.values() if str(value).strip()]) + if clue_count == 0: + score += 20 + elif clue_count == 1: + score += 8 + + score += min(len(reasons), 5) * 3 + + if len(form) <= 2: + score -= 120 + elif len(form) == 3: + score -= 35 + + severe_count = sum( + 1 + for reason in reasons + if reason in {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"} + ) + return ( + score, + severe_count, + int(source == "fallback"), + -len(preferred_definition), + str(entry.get("form", "")), + ) + + +def priority_bucket(entry: Dict[str, object]) -> str: + reasons = {str(item) for item in entry.get("review_reasons", []) or []} + if reasons.intersection({"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}): + return "red" + if reasons.intersection({"function_word", "very_short_word", "wiktextract_missing", "only_general_topics"}): + return "orange" + return "yellow" + + +def compact_entry(entry: Dict[str, object], score_tuple: Tuple[int, int, int, int, str]) -> Dict[str, object]: + score = score_tuple[0] + compact = dict(entry) + compact["priority_score"] 
= score + compact["priority_bucket"] = priority_bucket(entry) + return compact + + +def build_priority_review(args: argparse.Namespace) -> Dict[str, object]: + payload = load_json(args.input) + if not isinstance(payload, dict) or "entries" not in payload: + raise ValueError(f"File review non valido: {args.input}") + + entries = [entry for entry in payload.get("entries", []) or [] if isinstance(entry, dict)] + ranked = sorted( + entries, + key=priority_score, + reverse=True, + ) + + if args.top > 0: + ranked = ranked[: args.top] + + compact_entries = [compact_entry(entry, priority_score(entry)) for entry in ranked] + + practical_entries = [ + item + for item in compact_entries + if len(str(item.get("form", ""))) > 2 + ] + + bucket_counter = Counter(item["priority_bucket"] for item in compact_entries) + practical_bucket_counter = Counter(item["priority_bucket"] for item in practical_entries) + reason_counter = Counter() + for item in compact_entries: + for reason in item.get("review_reasons", []): + reason_counter[str(reason)] += 1 + + return { + "meta": { + "language": "it", + "version": 1, + "base_review": args.input.name, + "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"), + "entry_count": len(compact_entries), + "bucket_counts": dict(bucket_counter), + "practical_entry_count": len(practical_entries), + "practical_bucket_counts": dict(practical_bucket_counter), + "top_reason_counts": dict(reason_counter.most_common(12)), + }, + "entries": compact_entries, + "practical_entries": practical_entries, + } + + +def main() -> None: + args = parse_args() + payload = build_priority_review(args) + write_json(args.output, payload) + print(f"Review priority generato: {args.output}") + print(f"Voci nel priority file: {payload['meta']['entry_count']}") + print(f"Bucket: {payload['meta']['bucket_counts']}") + + +if __name__ == "__main__": + main() diff --git a/build_treccani_rescue_patch.py b/build_treccani_rescue_patch.py new file mode 100644 index 
0000000..b5a4f85 --- /dev/null +++ b/build_treccani_rescue_patch.py @@ -0,0 +1,153 @@ +from __future__ import annotations + +import argparse +import json +from datetime import datetime +from pathlib import Path +from typing import Dict, List + + +PRIORITY_INPUT_PATH = Path(__file__).with_name("to_be_review_priority.json") +PATCH_OUTPUT_PATH = Path(__file__).with_name("treccani_rescue_patch.json") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Estrae un lotto prioritario dal file to_be_review_priority.json per preparare una patch " + "manuale/assistita di rescue lessicale." + ) + ) + parser.add_argument( + "--input", + type=Path, + default=PRIORITY_INPUT_PATH, + help="File to_be_review_priority.json di partenza.", + ) + parser.add_argument( + "--output", + type=Path, + default=PATCH_OUTPUT_PATH, + help="Patch JSON da generare o aggiornare.", + ) + parser.add_argument( + "--limit", + type=int, + default=100, + help="Numero massimo di voci da preparare nel lotto.", + ) + parser.add_argument( + "--bucket", + default="red", + help="Bucket di priorita da considerare: red, orange, yellow oppure all.", + ) + return parser.parse_args() + + +def load_json(path: Path, default: object) -> object: + if not path.exists(): + return default + return json.loads(path.read_text(encoding="utf-8")) + + +def write_json(path: Path, payload: object) -> None: + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + +def build_record(entry: Dict[str, object]) -> Dict[str, object]: + return { + "form": entry.get("form"), + "lemma": entry.get("lemma"), + "pos": entry.get("pos"), + "priority_bucket": entry.get("priority_bucket"), + "priority_score": entry.get("priority_score"), + "review_reasons": entry.get("review_reasons", []), + "current_topics": entry.get("topics", []), + "current_definition": entry.get("preferred_definition", ""), + "current_source": entry.get("preferred_source", ""), + 
"rescue_definition": "", + "rescue_source": "treccani_rescue", + "rescue_topics": [], + "rescue_semantic_tags": [], + "rescue_notes": "", + "status": "pending", + } + + +def build_patch(args: argparse.Namespace) -> Dict[str, object]: + payload = load_json(args.input, {"entries": []}) + if not isinstance(payload, dict): + raise ValueError(f"File priority non valido: {args.input}") + + existing_patch = load_json(args.output, {"entries": []}) + if not isinstance(existing_patch, dict): + existing_patch = {"entries": []} + + existing_by_form = { + str(entry.get("form", "")).lower(): entry + for entry in existing_patch.get("entries", []) or [] + if isinstance(entry, dict) and entry.get("form") + } + + bucket = str(args.bucket or "red").strip().lower() + source_entries = payload.get("practical_entries") or payload.get("entries") or [] + + selected: List[Dict[str, object]] = [] + for entry in source_entries: + if not isinstance(entry, dict): + continue + if bucket != "all" and str(entry.get("priority_bucket", "")).lower() != bucket: + continue + form = str(entry.get("form", "")).strip().lower() + if not form: + continue + selected.append(entry) + if len(selected) >= max(1, int(args.limit)): + break + + merged_records = [] + seen = set() + for entry in selected: + form = str(entry.get("form", "")).strip().lower() + if form in existing_by_form: + merged_records.append(existing_by_form[form]) + else: + merged_records.append(build_record(entry)) + seen.add(form) + + for form, entry in existing_by_form.items(): + if form not in seen: + merged_records.append(entry) + + merged_records.sort( + key=lambda item: ( + {"pending": 0, "drafted": 1, "reviewed": 2, "done": 3}.get(str(item.get("status", "pending")), 9), + -int(item.get("priority_score", 0) or 0), + str(item.get("form", "")), + ) + ) + + return { + "meta": { + "language": "it", + "version": 1, + "base_priority": args.input.name, + "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"), + "batch_bucket": 
bucket, + "batch_limit": int(args.limit), + "entry_count": len(merged_records), + }, + "entries": merged_records, + } + + +def main() -> None: + args = parse_args() + payload = build_patch(args) + write_json(args.output, payload) + print(f"Patch rescue generata: {args.output}") + print(f"Voci nel lotto: {payload['meta']['entry_count']}") + + +if __name__ == "__main__": + main() diff --git a/clue_generator.py b/clue_generator.py new file mode 100644 index 0000000..6c6e4ae --- /dev/null +++ b/clue_generator.py @@ -0,0 +1,423 @@ +from __future__ import annotations + +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Sequence, Tuple + +from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH, TOPIC_DOMAIN_RULES, load_json +from crossword_generator import HORIZONTAL, Placement + + +@dataclass(frozen=True) +class Clue: + number: int + word: str + direction: str + x: int + y: int + text: str + source: str + + +@dataclass(frozen=True) +class ClueCandidate: + text: str + source: str + family: str + difficulty_hint: str + topic_score: int + strong_topic: bool + + +DIFFICULTY_ALIASES = { + "1": "easy", + "2": "medium", + "3": "hard", + "4": "expert", + "5": "expert", + "easy": "easy", + "medium": "medium", + "hard": "hard", + "expert": "expert", +} + +GENERIC_CLUE_PATTERNS = ( + "termine da ricavare dagli incroci", + "termine lessicale collegato", + "collegato a:", +) + + +def load_enriched_entries(path: Path = ENRICHED_LEXICON_OUTPUT_PATH) -> Dict[str, Dict[str, object]]: + payload = load_json(path, {"entries": []}) + if not isinstance(payload, dict): + return {} + return { + str(entry.get("form", "")).lower(): entry + for entry in payload.get("entries", []) or [] + if isinstance(entry, dict) and entry.get("form") + } + + +def normalize_difficulty(value: Optional[str]) -> str: + return DIFFICULTY_ALIASES.get(str(value or "medium").strip().lower(), "medium") + + +def clean_definition(text: str, 
answer: str) -> str: + clue = str(text or "") + clue = re.sub(r"\[[^\]]*\]", " ", clue) + clue = re.sub(r"\s+", " ", clue).strip(" .;:-") + if not clue: + return "" + clue = re.sub(re.escape(answer), "questa parola", clue, flags=re.IGNORECASE) + clue = re.sub(r"\(\s*\)", "", clue) + clue = re.sub(r"\s+,", ",", clue) + clue = re.sub(r"\s+;", ";", clue) + if clue and clue[0].islower(): + clue = clue[0].upper() + clue[1:] + return clue + "." + + +def synset_has_strong_topic_domain(synset: Dict[str, object], topic: Optional[str]) -> bool: + if not topic or topic == "general": + return True + rules = TOPIC_DOMAIN_RULES.get(topic, {}) + strong_domains = {str(domain).upper() for domain in rules.get("strong", ())} + if not strong_domains: + return True + domains = {str(domain).upper() for domain in synset.get("domains", []) or []} + return bool(domains.intersection(strong_domains)) + + +def text_contains_answer(text: str, answer: str) -> bool: + return bool(re.search(re.escape(answer), text, flags=re.IGNORECASE)) + + +def directness_score(text: str) -> int: + lowered = text.lower() + score = 0 + direct_keywords = ( + "strumento", + "veicolo", + "animale", + "pianta", + "titolo", + "edificio", + "persona", + "luogo", + "malattia", + "farmaco", + "mezzo", + "parte di", + ) + for keyword in direct_keywords: + if keyword in lowered: + score += 8 + if any(marker in lowered for marker in ("cioè", "ossia", "ovvero")): + score += 4 + return score + + +def preferred_length_range(difficulty: str) -> Tuple[int, int]: + if difficulty == "easy": + return 24, 90 + if difficulty == "medium": + return 20, 75 + if difficulty == "hard": + return 16, 60 + return 14, 50 + + +def score_candidate(candidate: ClueCandidate, answer: str, difficulty: str) -> int: + text = candidate.text + lowered = text.lower() + score = 0 + + if not text or len(text) < 12: + return -10_000 + + if any(pattern in lowered for pattern in GENERIC_CLUE_PATTERNS): + score -= 120 + + if text_contains_answer(text, answer): 
+ score -= 140 + else: + score += 40 + + min_len, max_len = preferred_length_range(difficulty) + length = len(text) + if min_len <= length <= max_len: + score += 28 + else: + score -= abs(length - max_len) if length > max_len else abs(min_len - length) // 2 + + directness = directness_score(text) + if difficulty == "easy": + score += directness * 2 + elif difficulty == "medium": + score += directness + elif difficulty == "hard": + score -= max(0, directness - 6) + else: + score -= directness + + family_bonus = { + "semantic_definition": 56, + "semantic_gloss": 34, + "refined_sense": 30, + "babelnet_best_gloss": 18, + "babelnet_gloss": 10, + "fallback": 0, + } + score += family_bonus.get(candidate.family, 0) + + difficulty_pref = { + "easy": {"direct", "balanced"}, + "medium": {"balanced", "direct"}, + "hard": {"balanced", "oblique"}, + "expert": {"oblique", "balanced"}, + } + if candidate.difficulty_hint in difficulty_pref.get(difficulty, {"balanced"}): + score += 18 + + if difficulty == "easy" and ";" in text: + score += 8 + if difficulty in {"hard", "expert"} and ";" in text: + score -= 8 + + if candidate.topic_score >= 40: + score += 18 + elif candidate.topic_score > 0: + score += 8 + elif candidate.family in {"babelnet_best_gloss", "babelnet_gloss"}: + score -= 140 + + if candidate.strong_topic: + score += 10 + + if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", lowered): + score -= 28 + + if length > 120: + score -= 45 + if length > 180: + score -= 90 + + return score + + +def candidate_hint(text: str, family: str) -> str: + lowered = text.lower() + if family in {"semantic_definition", "semantic_gloss"} and len(text) <= 70: + return "direct" + if any(marker in lowered for marker in ("fig.", "figurato", "poetico", "letterario")): + return "oblique" + if len(text) > 85: + return "direct" + return "balanced" + + +def add_candidate( + candidates: List[ClueCandidate], + seen: set[Tuple[str, str]], + *, + text: str, + answer: str, + 
source: str, + family: str, + topic_score: int = 0, + strong_topic: bool = False, +) -> None: + cleaned = clean_definition(text, answer) + if not cleaned: + return + key = (cleaned.lower(), family) + if key in seen: + return + seen.add(key) + candidates.append( + ClueCandidate( + text=cleaned, + source=source, + family=family, + difficulty_hint=candidate_hint(cleaned, family), + topic_score=topic_score, + strong_topic=strong_topic, + ) + ) + + +def semantic_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]: + semantic = entry.get("semantic", {}) + if not isinstance(semantic, dict): + return [] + + candidates: List[ClueCandidate] = [] + seen: set[Tuple[str, str]] = set() + + for synset in semantic.get("synsets", []) or []: + if not isinstance(synset, dict): + continue + add_candidate( + candidates, + seen, + text=str(synset.get("definition", "")), + answer=answer, + source="semantic", + family="semantic_definition", + ) + + for gloss in semantic.get("glosses", []) or []: + add_candidate( + candidates, + seen, + text=str(gloss), + answer=answer, + source="semantic", + family="semantic_gloss", + ) + + return candidates + + +def babelnet_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]: + babelnet = entry.get("babelnet", {}) + if not isinstance(babelnet, dict) or babelnet.get("status") not in {"enriched", "ambiguous"}: + return [] + + candidates: List[ClueCandidate] = [] + seen: set[Tuple[str, str]] = set() + + best_synset = babelnet.get("best_synset", {}) + if isinstance(best_synset, dict): + topic_score = int(best_synset.get("topic_score", 0) or 0) + strong_topic = bool(best_synset.get("strong_topic")) or synset_has_strong_topic_domain(best_synset, topic) + for gloss in best_synset.get("glosses", []) or []: + add_candidate( + candidates, + seen, + text=str(gloss), + answer=answer, + source="babelnet", + family="babelnet_best_gloss", + topic_score=topic_score, + strong_topic=strong_topic, + ) + + for 
synset in babelnet.get("synsets", []) or []: + if not isinstance(synset, dict): + continue + if topic and topic != "general" and not synset_has_strong_topic_domain(synset, topic): + continue + topic_score = 40 if topic and topic != "general" and synset_has_strong_topic_domain(synset, topic) else 0 + for gloss in synset.get("glosses", []) or []: + add_candidate( + candidates, + seen, + text=str(gloss), + answer=answer, + source="babelnet", + family="babelnet_gloss", + topic_score=topic_score, + strong_topic=topic_score >= 40, + ) + + return candidates + + +def refined_sense_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]: + senses = entry.get("senses", []) + if not isinstance(senses, list): + return [] + + candidates: List[ClueCandidate] = [] + seen: set[Tuple[str, str]] = set() + for sense in senses: + if not isinstance(sense, dict): + continue + confidence = float(sense.get("confidence", 0.0) or 0.0) + add_candidate( + candidates, + seen, + text=str(sense.get("definition", "")), + answer=answer, + source=str(sense.get("source", "refined")), + family="refined_sense", + topic_score=int(confidence * 100), + strong_topic=confidence >= 0.75, + ) + return candidates + + +def fallback_definition(entry: Dict[str, object], answer: str) -> str: + pos = str(entry.get("pos", "")).lower() + topics = ", ".join(str(topic) for topic in entry.get("topics", []) if topic and str(topic).lower() != "general") + if topics: + return f"Termine {pos or 'lessicale'} collegato all'ambito: {topics}." + return "Termine da ricavare dagli incroci." 
+ + +def all_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]: + candidates: List[ClueCandidate] = [] + candidates.extend(semantic_candidates(entry, answer)) + candidates.extend(refined_sense_candidates(entry, answer)) + candidates.extend(babelnet_candidates(entry, answer, topic)) + return candidates + + +def choose_candidate(candidates: Sequence[ClueCandidate], answer: str, difficulty: str) -> Optional[ClueCandidate]: + ranked = sorted( + candidates, + key=lambda candidate: ( + score_candidate(candidate, answer, difficulty), + candidate.topic_score, + len(candidate.text), + ), + reverse=True, + ) + return ranked[0] if ranked else None + + +def definition_for_word( + word: str, + entries: Dict[str, Dict[str, object]], + topic: Optional[str] = None, + difficulty: Optional[str] = None, +) -> tuple[str, str]: + answer = word.lower() + entry = entries.get(answer, {}) + if not entry: + return "Termine da ricavare dagli incroci.", "fallback" + + normalized_difficulty = normalize_difficulty(difficulty) + candidates = all_candidates(entry, answer, topic) + best = choose_candidate(candidates, answer, normalized_difficulty) + if best: + return best.text, best.source + + return fallback_definition(entry, answer), "fallback" + + +def generate_clues( + placements: Iterable[Placement], + entries: Dict[str, Dict[str, object]], + topic: Optional[str] = None, + difficulty: Optional[str] = None, +) -> List[Clue]: + clues = [] + for number, placement in enumerate(placements, start=1): + text, source = definition_for_word(placement.word, entries, topic, difficulty) + direction = "orizzontale" if placement.direction == HORIZONTAL else "verticale" + clues.append( + Clue( + number=number, + word=placement.word, + direction=direction, + x=placement.x, + y=placement.y, + text=text, + source=source, + ) + ) + return clues diff --git a/crossword_contract.md b/crossword_contract.md new file mode 100644 index 0000000..af83a6b --- /dev/null +++ 
b/crossword_contract.md @@ -0,0 +1,209 @@ +# Contratto JSON del Cruciverba + +Questo documento definisce il formato di scambio tra: + +- `brain`: il motore che genera e compila il cruciverba +- `client`: web app, backend, servizio PDF o altra macchina remota che richiede un cruciverba + +L'obiettivo e' avere un payload: + +- completo +- stabile +- espandibile +- riusabile per stampa PDF, gioco web e archiviazione + +## Flusso + +1. Il client invia una `request` JSON al motore. +2. Il motore risponde con una `response` JSON completa del cruciverba. +3. Lo stesso JSON di risposta puo' essere: + - salvato a database + - convertito in PDF + - renderizzato in una pagina web interattiva + - riaperto in futuro senza rigenerare il cruciverba + +## Principi di progettazione + +- Ogni cruciverba ha un `crossword_id` univoco. +- La `request` conserva i parametri di generazione originali. +- La `response` include sia la griglia giocabile sia la soluzione. +- Le parole hanno metadati ricchi: posizione, direzione, clue, tema, pos, fonte clue. +- Le coordinate sono sempre assolute e 0-based nella griglia normalizzata esportata. +- La griglia esportata e' rettangolare e normalizzata: niente coordinate negative. +- Il formato supporta versioning con `schema_version`. 
+ +## Request + +Campi principali: + +- `schema_version`: versione del contratto +- `request_id`: id della richiesta lato client +- `requested_at`: timestamp ISO 8601 +- `generator`: configurazione del motore +- `output`: preferenze di output +- `client_context`: metadati opzionali del chiamante + +### `generator` + +- `topic`: stringa o lista di topic +- `difficulty`: alias testuale +- `seed`: opzionale, per riproducibilita' +- `initial_word_count` +- `themed_fill_count` +- `target_empty_ratio` +- `diffxy` +- `time_limit_seconds` +- `max_candidates_per_word` +- `lexicon_file` +- `definitions_enabled` +- `definition_style`: per future varianti clue +- `preferred_output_language` + +### `output` + +- `include_solution_grid` +- `include_clue_sources` +- `include_diagnostics` +- `include_generation_log` +- `format_hints` + +## Response + +Campi principali: + +- `schema_version` +- `request_id` +- `crossword_id` +- `generated_at` +- `status` +- `generator` +- `summary` +- `grid` +- `entries` +- `clues` +- `solution` +- `diagnostics` +- `artifacts` + +## Sezione `grid` + +- `rows` +- `cols` +- `cell_size_hint` +- `cells` + +Ogni cella ha: + +- `row` +- `col` +- `kind`: `block` oppure `letter` +- `solution` +- `display` +- `number`: numero clue se la cella apre una parola +- `across_entry_id` +- `down_entry_id` +- `is_prefilled` + +Note: + +- `solution` contiene sempre la lettera corretta per celle attive. +- `display` e' vuoto per la scheda giocatore. +- `number` serve per numerazione in stampa e web. + +## Sezione `entries` + +Ogni entry rappresenta una parola collocata in griglia. + +Campi: + +- `entry_id` +- `number` +- `direction`: `across` o `down` +- `answer` +- `answer_length` +- `row` +- `col` +- `cells`: lista coordinate +- `clue` +- `clue_source` +- `topics` +- `pos` +- `is_seed` +- `added_by_filler` +- `confidence` + +## Sezione `clues` + +Ridondante ma utile per client semplici. 
+ +- `across`: lista clues orizzontali +- `down`: lista clues verticali + +Ogni clue: + +- `number` +- `entry_id` +- `text` +- `enumeration` +- `topic_match` +- `source` + +## Sezione `solution` + +- `grid_rows`: lista di stringhe, una per riga +- `words`: elenco risposte + +`grid_rows` usa: + +- lettera maiuscola per cella piena +- `#` per casella nera + +## Sezione `diagnostics` + +Serve a tuning, benchmark e debug. + +- `total_words` +- `seed_words_requested` +- `seed_words_placed` +- `filler_words_added` +- `intersections` +- `filled_cells` +- `empty_cells` +- `empty_ratio` +- `target_empty_ratio` +- `topic_words` +- `off_topic_words` +- `pos_counts` +- `runtime_lexicon` +- `seed` +- `generation_seconds` + +## Sezione `artifacts` + +URL o path futuri per file derivati. + +- `pdf_player` +- `pdf_solution` +- `thumbnail` +- `html_preview` + +## Estensioni future previste + +- `difficulty_profile`: facile/medio/difficile per definizioni separate +- `hints`: aiuti progressivi per singola parola +- `theme_story`: testo introduttivo del cruciverba +- `player_state`: salvataggio partita in corso +- `stats`: tempi, errori, percentuali di completamento + +## Regola pratica consigliata + +La macchina "brain" deve esporre almeno due endpoint logici: + +- `POST /crosswords/generate` + - input: request JSON + - output: response JSON + +- `GET /crosswords/{crossword_id}` + - output: stessa response JSON salvata + +In questo modo il contratto resta identico sia via file sia via webservice. 
diff --git a/crossword_contract_example_request.json b/crossword_contract_example_request.json new file mode 100644 index 0000000..e93d822 --- /dev/null +++ b/crossword_contract_example_request.json @@ -0,0 +1,37 @@ +{ + "schema_version": "1.0", + "request_id": "req-2026-04-28-0001", + "requested_at": "2026-04-28T17:05:00+02:00", + "generator": { + "topic": [ + "transport" + ], + "difficulty": "medium", + "seed": 2, + "initial_word_count": 19, + "themed_fill_count": 10, + "target_empty_ratio": 0.1667, + "diffxy": 7, + "time_limit_seconds": 8.0, + "max_candidates_per_word": 12, + "lexicon_file": "lexicon_it_curated_llm_aggressive.json", + "definitions_enabled": true, + "definition_style": "classic", + "preferred_output_language": "it" + }, + "output": { + "include_solution_grid": true, + "include_clue_sources": true, + "include_diagnostics": true, + "include_generation_log": false, + "format_hints": { + "pdf_page_size": "A4", + "mobile_layout": true + } + }, + "client_context": { + "channel": "web", + "user_locale": "it-IT", + "app_version": "alpha-1" + } +} diff --git a/crossword_contract_example_response.json b/crossword_contract_example_response.json new file mode 100644 index 0000000..c5a3558 --- /dev/null +++ b/crossword_contract_example_response.json @@ -0,0 +1,138 @@ +{ + "schema_version": "1.0", + "request_id": "req-2026-04-28-0001", + "crossword_id": "cw-2026-04-28-transport-0001", + "generated_at": "2026-04-28T17:06:42+02:00", + "status": "ok", + "generator": { + "topic": [ + "transport" + ], + "difficulty": "medium", + "seed": 2, + "runtime_lexicon": "lexicon_it_curated_llm_aggressive.json" + }, + "summary": { + "title": "Cruciverba a tema trasporti", + "subtitle": "Schema generato automaticamente", + "rows": 12, + "cols": 12, + "total_words": 6, + "intersections": 7 + }, + "grid": { + "rows": 12, + "cols": 12, + "cell_size_hint": 32, + "cells": [ + { + "row": 0, + "col": 0, + "kind": "letter", + "solution": "A", + "display": "", + "number": 1, + 
"across_entry_id": "A1", + "down_entry_id": null, + "is_prefilled": false + }, + { + "row": 0, + "col": 1, + "kind": "letter", + "solution": "M", + "display": "", + "number": null, + "across_entry_id": "A1", + "down_entry_id": "D2", + "is_prefilled": false + }, + { + "row": 0, + "col": 2, + "kind": "block", + "solution": null, + "display": null, + "number": null, + "across_entry_id": null, + "down_entry_id": null, + "is_prefilled": false + } + ] + }, + "entries": [ + { + "entry_id": "A1", + "number": 1, + "direction": "across", + "answer": "AMBULANZA", + "answer_length": 9, + "row": 0, + "col": 0, + "cells": [ + [0, 0], + [0, 1], + [0, 2] + ], + "clue": "Veicolo di soccorso sanitario.", + "clue_source": "semantic_definition", + "topics": [ + "transport", + "health" + ], + "pos": "NOUN", + "is_seed": true, + "added_by_filler": false, + "confidence": 0.95 + } + ], + "clues": { + "across": [ + { + "number": 1, + "entry_id": "A1", + "text": "Veicolo di soccorso sanitario.", + "enumeration": 9, + "topic_match": true, + "source": "semantic_definition" + } + ], + "down": [] + }, + "solution": { + "grid_rows": [ + "AM#ULA######", + "##B#########" + ], + "words": [ + "AMBULANZA" + ] + }, + "diagnostics": { + "seed_words_requested": 19, + "seed_words_placed": 19, + "filler_words_added": 5, + "filled_cells": 84, + "empty_cells": 18, + "empty_ratio": 0.1765, + "target_empty_ratio": 0.1667, + "topic_words": 21, + "off_topic_words": 3, + "pos_counts": { + "sostantivi": 20, + "aggettivi": 2, + "verbi": 1, + "avverbi": 0, + "preposizioni": 0, + "congiunzioni": 0, + "altri": 1 + }, + "generation_seconds": 124.6 + }, + "artifacts": { + "pdf_player": null, + "pdf_solution": null, + "thumbnail": null, + "html_preview": null + } +} diff --git a/crossword_filler.py b/crossword_filler.py index c4bf472..8a7ecb2 100644 --- a/crossword_filler.py +++ b/crossword_filler.py @@ -87,7 +87,12 @@ class CrosswordFiller: self.words_by_length = self._index_vocabulary(self.vocabulary) 
self.vocabulary_metadata = vocabulary_metadata or {} self.semantic_metadata = semantic_metadata or {} - self.selected_topic = selected_topic.strip().lower() + self.selected_topics = [ + topic.strip().lower() + for topic in selected_topic.split(",") + if topic.strip() + ] or ["general"] + self.selected_topic = self.selected_topics[0] self.max_themed_fill_words = max(0, max_themed_fill_words) self.seed = seed self.rng = random.Random(seed) @@ -333,7 +338,7 @@ class CrosswordFiller: return score def _semantic_topic_score(self, word: str) -> int: - if not self.selected_topic or self.selected_topic == "general": + if not self.selected_topics or self.selected_topics == ["general"]: return 0 entry = self._semantic_entry(word) @@ -350,9 +355,9 @@ class CrosswordFiller: semantic = entry.get("semantic", {}) semantic_topics = {str(item).lower() for item in semantic.get("semantic_topics", [])} score = 0 - if self.selected_topic in topics: + if any(topic in topics for topic in self.selected_topics): score += 4 - if self.selected_topic in semantic_topics: + if any(topic in semantic_topics for topic in self.selected_topics): score += 6 if "general" in topics: score += 1 diff --git a/curate_lexicon_alpha.py b/curate_lexicon_alpha.py new file mode 100644 index 0000000..c23201a --- /dev/null +++ b/curate_lexicon_alpha.py @@ -0,0 +1,611 @@ +from __future__ import annotations + +import argparse +import json +import re +from copy import deepcopy +from datetime import datetime +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Sequence, Tuple + +from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH + + +CURATED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_curated.json") +TO_BE_REVIEW_OUTPUT_PATH = Path(__file__).with_name("to_be_review.json") + +DIFFICULTIES = ("easy", "medium", "hard", "expert") + +TEXT_REPLACEMENTS = { + " ngrandimento": " ingrandimento", + "superificie": "superficie", + "quantitaaa": "quantità", + "quantitaaaa": 
"quantità", + "quantit": "quantità", + "sanit_militare": "sanità_militare", + " unaparola ": " una parola ", + "questa parola, ": "", + "questa parola; ": "", +} + +SUSPICIOUS_PROPER_PATTERNS = ( + r"\bepisodio\b", + r"\bfilm\b", + r"\bserie tv\b", + r"\bfamiglia\b", + r"\bcomune italiano\b", + r"\bfrazione del comune\b", + r"\bcitta metropolitana\b", + r"\bpersonaggio\b", + r"\balbum\b", + r"\bcognome\b", + r"\bnome proprio\b", +) + +DOMAIN_HINTS = { + "religion": ("monastero", "abbazia", "sacerdot", "prete", "vescovo", "clero", "religios"), + "transport": ("veicolo", "motore", "aereo", "treno", "nave", "trasport", "rimorch", "reattor"), + "health": ("malat", "ferit", "ospedal", "medic", "sanitar", "cura", "paziente"), + "nature": ("animale", "pianta", "mare", "bosco", "albero", "fiore", "montagna", "acque", "salate"), + "geography": ("comune", "paese", "regione", "provincia", "isola", "citta", "territorio"), + "sea": ("acque", "salate", "superficie terrestre", "oceano"), +} + +ABSTRACT_PATTERNS = ( + r"\bgrande quantita\b", + r"\bfigurato\b", + r"\bsenso figurato\b", +) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Cura il lessico refined per la milestone alpha e separa i casi dubbi in to_be_review.json." + ) + parser.add_argument( + "--input", + type=Path, + default=REFINED_LEXICON_OUTPUT_PATH, + help="Lessico refined di partenza.", + ) + parser.add_argument( + "--output", + type=Path, + default=CURATED_LEXICON_OUTPUT_PATH, + help="Lessico curated da generare.", + ) + parser.add_argument( + "--review-output", + type=Path, + default=TO_BE_REVIEW_OUTPUT_PATH, + help="File JSON con le voci che richiedono revisione umana.", + ) + parser.add_argument( + "--max-review", + type=int, + default=0, + help="Limite opzionale di voci da esportare in to_be_review.json. 
def load_json(path: Path) -> Dict[str, object]:
    """Read *path* as UTF-8 text and return the decoded JSON document."""
    return json.loads(path.read_text(encoding="utf-8"))


def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Write *payload* to *path* as indented UTF-8 JSON, keeping non-ASCII characters."""
    path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")


def dedupe(items: Iterable[str]) -> List[str]:
    """Return the stripped, non-empty items in first-seen order, deduplicated case-insensitively."""
    result: List[str] = []
    seen = set()
    for item in items:
        text = str(item).strip()
        if not text:
            continue
        key = text.lower()
        if key in seen:
            continue
        seen.add(key)
        result.append(text)
    return result


# Built once at import time. Upper-case accented vowels are included because
# callers fold first and lower-case afterwards, so "À" would otherwise survive.
_ACCENT_FOLD_TABLE = str.maketrans(
    "àáèéìíòóùúÀÁÈÉÌÍÒÓÙÚ",
    "aaeeiioouuAAEEIIOOUU",
)


def ascii_fold(text: str) -> str:
    """Replace grave/acute accented vowels (either case) with their unaccented letter."""
    return str(text).translate(_ACCENT_FOLD_TABLE)


def normalize_text(text: str, replacements: Optional[Dict[str, str]] = None) -> str:
    """Clean a definition fragment into a capitalised sentence ending with a full stop.

    Args:
        text: Raw definition text; ``None`` and empty values are accepted.
        replacements: Optional ``{wrong: right}`` table. When omitted the
            module-level ``TEXT_REPLACEMENTS`` table is used.

    Returns:
        The normalised sentence, or ``""`` when nothing meaningful is left
        (empty input or punctuation-only input).
    """
    value = str(text or "").strip()
    if not value:
        return ""

    table = TEXT_REPLACEMENTS if replacements is None else replacements
    for old, new in table.items():
        if not old:
            continue
        if old[-1].isalnum():
            # A key ending in a letter must not match inside a longer word:
            # a plain replace of "quantit" would turn the already-correct
            # "quantità" into "quantitàà".
            value = re.sub(re.escape(old) + r"(?!\w)", lambda _match, new=new: new, value)
        else:
            value = value.replace(old, new)

    value = re.sub(r"\s+", " ", value)
    value = re.sub(r"\s*;\s*", "; ", value)
    value = re.sub(r"\s*,\s*", ", ", value)
    value = value.strip(" .;:-")
    if not value:
        # Punctuation-only input used to come back as a bare ".".
        return ""
    if value[0].islower():
        value = value[0].upper() + value[1:]
    return value + "."
def split_definition_text(text: str) -> List[str]:
    """Split a raw definition on ``;`` and sentence breaks and normalise every piece.

    Pieces that normalise to an empty string are dropped.
    """
    value = str(text or "").strip()
    if not value:
        return []
    pieces = re.split(r"\s*;\s+|\.\s+(?=[a-zàèéìòù])", value, flags=re.IGNORECASE)
    normalized = []
    for piece in pieces:
        cleaned = normalize_text(piece)
        if cleaned:
            normalized.append(cleaned)
    return normalized


def entry_is_common_word(entry: Dict[str, object]) -> bool:
    """Return True when the entry form starts lower-case and carries no ``name_tags``."""
    form = str(entry.get("form", ""))
    return bool(form) and form[:1].islower() and not (entry.get("name_tags") or [])


def definition_mentions_answer(text: str, answer: str) -> bool:
    """Return True when the accent-folded, lower-cased *answer* occurs inside *text*.

    An empty or blank answer never counts as mentioned: an empty needle matches
    every string, which previously penalised every candidate of such an entry.
    """
    if not str(answer or "").strip():
        return False
    normalized_text = ascii_fold(text).lower()
    normalized_answer = ascii_fold(answer).lower()
    # Literal containment; equivalent to searching for the escaped pattern.
    return normalized_answer in normalized_text


def suspicious_proper_noun_definition(text: str, entry: Dict[str, object]) -> bool:
    """Return True when a common-word entry has a definition matching a proper-noun pattern."""
    if not entry_is_common_word(entry):
        return False
    lowered = ascii_fold(text).lower()
    return any(re.search(pattern, lowered) for pattern in SUSPICIOUS_PROPER_PATTERNS)


def likely_abstract_detour(text: str) -> bool:
    """Return True when the folded text matches one of ``ABSTRACT_PATTERNS``."""
    lowered = ascii_fold(text).lower()
    return any(re.search(pattern, lowered) for pattern in ABSTRACT_PATTERNS)


def semantic_topics(entry: Dict[str, object]) -> List[str]:
    """Collect lower-cased topics from ``semantic.semantic_topics`` and ``wiktextract.topic_hints``."""
    semantic = entry.get("semantic", {})
    topics = []
    if isinstance(semantic, dict):
        topics.extend(str(item).lower() for item in semantic.get("semantic_topics", []) or [])
    wiktextract = entry.get("wiktextract", {})
    if isinstance(wiktextract, dict):
        topics.extend(str(item).lower() for item in wiktextract.get("topic_hints", []) or [])
    return dedupe(topics)


def lexical_topics(entry: Dict[str, object]) -> List[str]:
    """Return the entry's own ``topics`` lower-cased, skipping falsy items."""
    return [str(item).lower() for item in entry.get("topics", []) or [] if item]


def topic_alignment_score(text: str, entry: Dict[str, object]) -> int:
    """Add 16 points for every ``DOMAIN_HINTS`` substring of the entry's topics found in *text*."""
    lowered = ascii_fold(text).lower()
    score = 0
    topics = set(lexical_topics(entry)) | set(semantic_topics(entry))
    for topic in topics:
        for hint in DOMAIN_HINTS.get(topic, ()):
            if hint in lowered:
                score += 16
    return score
def candidate_style(text: str) -> str:
    """Classify a clue text as ``direct``, ``balanced`` or ``oblique``.

    Texts containing ``;`` or longer than 90 characters are ``direct``; texts
    containing one of a few explanatory markers are ``balanced``.
    """
    lowered = ascii_fold(text).lower()
    if ";" in text or len(text) > 90:
        return "direct"
    if any(marker in lowered for marker in ("chi ", "che ", "strumento", "veicolo", "titolo", "parte di")):
        return "balanced"
    return "oblique"


def length_window(difficulty: str) -> Tuple[int, int]:
    """Return the ``(min, max)`` clue length for *difficulty*; unknown values get the expert window."""
    if difficulty == "easy":
        return 18, 90
    if difficulty == "medium":
        return 18, 78
    if difficulty == "hard":
        return 14, 62
    return 12, 55


def build_candidate(
    text: str,
    *,
    source: str,
    family: str,
    confidence: float,
    priority: int = 0,
) -> Dict[str, object]:
    """Build a candidate record around the normalised *text*."""
    cleaned = normalize_text(text)
    return {
        "text": cleaned,
        "source": source,
        "family": family,
        "confidence": confidence,
        "style": candidate_style(cleaned),
        "priority": priority,
    }


def collect_candidates(entry: Dict[str, object]) -> List[Dict[str, object]]:
    """Gather clue candidates from every definition source attached to *entry*.

    Sources are visited in a fixed order: semantic synset definitions, semantic
    glosses, refined senses, BabelNet glosses, wiktextract definitions. A
    candidate is kept once per ``(lower-cased text, family)`` pair.
    """
    candidates: List[Dict[str, object]] = []
    seen = set()

    def add_pieces(raw_text: object, *, source: str, family: str, confidence: float, priority: int) -> None:
        # JSON null must not be stringified: str(None) used to produce a junk
        # "None." candidate.
        if raw_text is None:
            return
        for piece in split_definition_text(str(raw_text)):
            candidate = build_candidate(
                piece,
                source=source,
                family=family,
                confidence=confidence,
                priority=priority,
            )
            key = (candidate["text"].lower(), candidate["family"])
            if candidate["text"] and key not in seen:
                seen.add(key)
                candidates.append(candidate)

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        for index, synset in enumerate(semantic.get("synsets", []) or []):
            if not isinstance(synset, dict):
                continue
            add_pieces(
                synset.get("definition", ""),
                source="semantic",
                family="semantic_definition",
                confidence=0.9,
                priority=max(0, 100 - index * 12),
            )
        for index, gloss in enumerate(semantic.get("glosses", []) or []):
            add_pieces(
                gloss,
                source="semantic_gloss",
                family="semantic_gloss",
                confidence=0.8,
                priority=max(0, 90 - index * 10),
            )

    for index, sense in enumerate(entry.get("senses", []) or []):
        if not isinstance(sense, dict):
            continue
        source = str(sense.get("source", "refined"))
        add_pieces(
            sense.get("definition", ""),
            # Senses copied from the semantic layer are relabelled as refined.
            source="refined" if source == "semantic" else source,
            family="refined_sense",
            confidence=float(sense.get("confidence", 0.7) or 0.7),
            priority=max(0, 80 - index * 8),
        )

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        best_synset = babelnet.get("best_synset", {})
        if isinstance(best_synset, dict):
            confidence = 0.85 if babelnet.get("status") == "enriched" else 0.55
            for index, gloss in enumerate(best_synset.get("glosses", []) or []):
                add_pieces(
                    gloss,
                    source="babelnet",
                    family="babelnet_gloss",
                    confidence=confidence,
                    priority=max(0, 60 - index * 8),
                )

    wiktextract = entry.get("wiktextract", {})
    if isinstance(wiktextract, dict):
        confidence = 0.78 if wiktextract.get("matched") else 0.45
        for index, definition in enumerate(wiktextract.get("definitions", []) or []):
            add_pieces(
                definition,
                source="wiktextract",
                family="wiktextract_definition",
                confidence=confidence,
                priority=max(0, 88 - index * 9),
            )

    return candidates
def score_candidate(candidate: Dict[str, object], entry: Dict[str, object], difficulty: str) -> int:
    """Score how well *candidate* works as a clue for *entry* at *difficulty*.

    Higher is better. Texts shorter than 12 characters are rejected outright
    with ``-10_000``; every other signal is an additive bonus or penalty.
    """
    text = str(candidate["text"])
    answer = str(entry.get("form", "")).lower()
    source = str(candidate.get("source"))
    family = str(candidate.get("family"))
    confidence = float(candidate.get("confidence", 0.0) or 0.0)

    length = len(text)
    if length < 12:
        return -10_000

    # A clue that gives the answer away is heavily penalised.
    total = -140 if definition_mentions_answer(text, answer) else 30

    if suspicious_proper_noun_definition(text, entry):
        total -= 220
    if likely_abstract_detour(text):
        total -= 80

    min_len, max_len = length_window(difficulty)
    if min_len <= length <= max_len:
        total += 24
    elif length > max_len:
        total -= length - max_len
    else:
        # Too-short texts cost half as much per character as too-long ones.
        total -= (min_len - length) // 2

    source_bonus = {
        "semantic": 55,
        "semantic_gloss": 40,
        "babelnet": 24,
        "refined": 30,
        "wiktextract": 52,
    }
    family_bonus = {
        "semantic_definition": 30,
        "semantic_gloss": 18,
        "babelnet_gloss": 8,
        "refined_sense": 22,
        "wiktextract_definition": 28,
    }
    total += source_bonus.get(source, 10)
    total += family_bonus.get(family, 0)
    total += int(candidate.get("priority", 0) or 0)
    total += int(confidence * 35)

    alignment = topic_alignment_score(text, entry)
    total += alignment
    entry_topics = set(lexical_topics(entry)) | set(semantic_topics(entry))
    concrete = entry_topics & {"religion", "transport", "health", "nature", "geography", "sea"}
    if concrete and alignment == 0:
        # The entry has a concrete domain but the text shows no sign of it.
        total -= 45

    style = str(candidate.get("style"))
    style_bonus = {
        ("easy", "direct"): 16,
        ("medium", "direct"): 14,
        ("medium", "balanced"): 14,
        ("hard", "balanced"): 10,
        ("expert", "oblique"): 10,
    }
    total += style_bonus.get((difficulty, style), 0)

    if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", text.lower()):
        total -= 30
    if difficulty in {"hard", "expert"} and ";" in text:
        total -= 10
    if entry.get("needs_review"):
        total -= 8

    return total
def choose_best_candidate(
    candidates: Sequence[Dict[str, object]],
    entry: Dict[str, object],
    difficulty: str,
) -> Optional[Dict[str, object]]:
    """Return the top-ranked candidate for *difficulty*, or ``None`` when there are none.

    Ranking is by score, then confidence, then priority, then shorter text.
    Ties keep the earliest candidate, as the previous stable descending sort did.
    """

    def rank(candidate: Dict[str, object]) -> Tuple[int, float, float, int]:
        return (
            score_candidate(candidate, entry, difficulty),
            float(candidate.get("confidence", 0.0)),
            float(candidate.get("priority", 0.0)),
            -len(str(candidate.get("text", ""))),
        )

    # Only the maximum is needed, so a single pass replaces the full sort.
    return max(candidates, key=rank, default=None)


def review_reasons(entry: Dict[str, object], candidates: Sequence[Dict[str, object]]) -> List[str]:
    """List the reasons why *entry* should be looked at by a human reviewer.

    Reads ``preferred_definition`` / ``preferred_source`` from *entry*, so it
    must run after those fields have been set. Returns deduplicated reason
    codes in the order the checks are made.
    """
    reasons: List[str] = []
    form = str(entry.get("form", ""))
    lowered_topics = set(lexical_topics(entry))
    semantic_topic_set = set(semantic_topics(entry))
    babelnet_status = str((entry.get("babelnet") or {}).get("status", ""))
    wiktextract = entry.get("wiktextract", {})
    wiktextract_status = str(wiktextract.get("status", "")) if isinstance(wiktextract, dict) else ""
    preferred_definition = str(entry.get("preferred_definition", ""))
    preferred_source = str(entry.get("preferred_source", ""))

    if not candidates:
        reasons.append("no_viable_definition")
    if not preferred_definition and entry.get("needs_review"):
        reasons.append("flagged_by_refined_stage")
    if preferred_definition and suspicious_proper_noun_definition(preferred_definition, entry):
        reasons.append("proper_noun_collision")
    if babelnet_status == "ambiguous" and preferred_source == "babelnet":
        reasons.append("babelnet_ambiguous")
    if wiktextract_status in {"missing", "no_match"} and not preferred_definition:
        reasons.append("wiktextract_missing")
    if lowered_topics == {"general"} and not semantic_topic_set and not preferred_definition:
        reasons.append("only_general_topics")
    if len(form) <= 2:
        reasons.append("very_short_word")
    if str(entry.get("pos", "")) in {"PREP", "CONJ"}:
        reasons.append("function_word")
    if preferred_source == "babelnet" and any(
        # Matches a missing "topics" value as well as a literal "None" inside it.
        "None" in str(sense.get("topics"))
        # "or []" guards against an explicit null, as the rest of the module does.
        for sense in entry.get("senses", []) or []
        if isinstance(sense, dict)
    ):
        reasons.append("unresolved_sense_topics")
    if preferred_definition and definition_mentions_answer(preferred_definition, form.lower()):
        reasons.append("candidate_mentions_answer")

    return dedupe(reasons)
def curate_entry(entry: Dict[str, object]) -> Tuple[Dict[str, object], Optional[Dict[str, object]]]:
    """Curate one lexicon entry and, when needed, build its review record.

    Works on a deep copy of *entry*. Returns ``(curated_entry, review_item)``,
    where ``review_item`` is ``None`` when no review reason was raised.
    """
    curated = deepcopy(entry)
    candidates = collect_candidates(curated)

    clue_definitions: Dict[str, str] = {}
    clue_sources: Dict[str, str] = {}
    clue_scores: Dict[str, int] = {}
    for difficulty in DIFFICULTIES:
        best = choose_best_candidate(candidates, curated, difficulty)
        if not best:
            continue
        clue_definitions[difficulty] = str(best["text"])
        clue_sources[difficulty] = str(best["source"])
        clue_scores[difficulty] = score_candidate(best, curated, difficulty)

    # The medium clue is the canonical definition; easy is the fallback.
    preferred_definition = clue_definitions.get("medium") or clue_definitions.get("easy") or ""
    preferred_source = clue_sources.get("medium") or clue_sources.get("easy") or "fallback"

    curation_notes: List[str] = []
    if preferred_definition:
        curation_notes.append(f"preferred_from={preferred_source}")
        # NOTE(review): nesting reconstructed from a whitespace-collapsed patch;
        # confirm this check belongs inside the `if preferred_definition` branch.
        if clue_scores.get("medium", -9999) < 20:
            curation_notes.append("weak_medium_definition")

    curated["curated_glosses"] = dedupe(item["text"] for item in candidates)
    curated["curated_senses"] = [
        {
            "definition": item["text"],
            "source": item["source"],
            "family": item["family"],
            "confidence": item["confidence"],
            "priority": item["priority"],
        }
        for item in candidates
    ]
    curated["preferred_definition"] = preferred_definition
    curated["preferred_source"] = preferred_source
    curated["clue_definitions"] = clue_definitions
    curated["clue_sources"] = clue_sources
    curated["clue_scores"] = clue_scores
    curated["curation_notes"] = curation_notes

    # The review checks read the preferred_* fields assigned just above.
    reasons = review_reasons(curated, candidates)
    blocking = {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
    curated["alpha_ready"] = bool(preferred_definition) and not blocking.intersection(reasons)
    curated["review_reasons"] = reasons

    if not reasons:
        return curated, None

    semantic_block = curated.get("semantic") or {}
    babelnet_block = curated.get("babelnet") or {}
    wiktextract_block = curated.get("wiktextract") or {}
    review_item = {
        "form": curated.get("form"),
        "lemma": curated.get("lemma"),
        "pos": curated.get("pos"),
        "topics": curated.get("topics"),
        "topic_suggestions": curated.get("topic_suggestions"),
        "preferred_definition": preferred_definition,
        "preferred_source": preferred_source,
        "clue_definitions": clue_definitions,
        "review_reasons": reasons,
        "semantic_glosses": semantic_block.get("glosses", []),
        "senses": curated.get("senses", []),
        "babelnet_status": babelnet_block.get("status"),
        "babelnet_best_synset": babelnet_block.get("best_synset"),
        "wiktextract_status": wiktextract_block.get("status"),
        "wiktextract": curated.get("wiktextract"),
        # Only the first 12 candidates are exported to keep the review file small.
        "candidate_pool": [
            {
                "text": item["text"],
                "source": item["source"],
                "family": item["family"],
                "confidence": item["confidence"],
                "priority": item["priority"],
            }
            for item in candidates[:12]
        ],
    }
    return curated, review_item
def build_curated_lexicon(args: argparse.Namespace) -> Tuple[Dict[str, object], Dict[str, object]]:
    """Curate every entry of the refined lexicon named by ``args.input``.

    Returns:
        ``(curated_payload, review_payload)``, each with a ``meta`` block and an
        ``entries`` list.

    Raises:
        ValueError: when the input document is not an object with an ``entries`` key.
    """
    payload = load_json(args.input)
    if not isinstance(payload, dict) or "entries" not in payload:
        raise ValueError(f"Lessico refined non valido: {args.input}")

    curated_entries: List[Dict[str, object]] = []
    review_entries: List[Dict[str, object]] = []

    for entry in payload.get("entries", []) or []:
        if not isinstance(entry, dict):
            continue
        curated, review_item = curate_entry(entry)
        curated_entries.append(curated)
        if review_item:
            review_entries.append(review_item)

    if args.max_review > 0:
        review_entries = review_entries[: args.max_review]

    # One timestamp for both payloads, so the two files of a run always agree.
    generated_at = datetime.now().astimezone().isoformat(timespec="seconds")

    curated_payload = {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": args.input.name,
            "generated_at": generated_at,
            "entry_count": len(curated_entries),
            "alpha_ready_count": sum(1 for item in curated_entries if item.get("alpha_ready")),
            # Counts the exported review entries, i.e. after the --max-review cut.
            "review_count": len(review_entries),
        },
        "entries": curated_entries,
    }

    review_payload = {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": args.input.name,
            "generated_at": generated_at,
            "entry_count": len(review_entries),
        },
        "entries": review_entries,
    }

    return curated_payload, review_payload


def main() -> None:
    """Command-line entry point: build both payloads, write them, print a summary."""
    args = parse_args()
    curated_payload, review_payload = build_curated_lexicon(args)
    write_json(args.output, curated_payload)
    write_json(args.review_output, review_payload)
    print(f"Lessico curated generato: {args.output}")
    print(f"Voci totali: {curated_payload['meta']['entry_count']}")
    print(f"Voci alpha_ready: {curated_payload['meta']['alpha_ready_count']}")
    print(f"Voci da revisionare: {review_payload['meta']['entry_count']}")
    print(f"File review generato: {args.review_output}")


if __name__ == "__main__":
    main()
# Review reasons that select an entry for offline wiktextract enrichment by default.
DEFAULT_REVIEW_REASONS = {"no_viable_definition", "only_general_topics", "babelnet_ambiguous"}

# wiktextract part-of-speech code -> lexicon POS tag.
# (A duplicated "adj" key has been removed; the mapping is unchanged.)
POS_MAP = {
    "noun": "NOUN",
    "adj": "ADJ",
    "verb": "VERB",
    "adv": "ADV",
    "prep": "PREP",
    "conj": "CONJ",
    "pron": "PRON",
    "intj": "INTJ",
}

# wiktextract sense topic -> lexicon topic.
TOPIC_MAP = {
    "christianity": "religion",
    "religion": "religion",
    "history": "history",
    "agriculture": "agriculture",
    "engineering": "technology",
    "mechanics": "technology",
    "technology": "technology",
    "medicine": "health",
    "geography": "geography",
    "biology": "nature",
    "aeronautics": "transport",
}

# Lower-cased wiktextract category name -> lexicon topic.
CATEGORY_TOPIC_HINTS = {
    "religione-it": "religion",
    "cristianesimo-it": "religion",
    "storia-it": "history",
    "agricoltura-it": "agriculture",
    "medicina-it": "health",
    "ingegneria-it": "technology",
    "meccanica-it": "technology",
    "tecnologia-it": "technology",
    "geografia-it": "geography",
    "biologia-it": "nature",
    "aeronautica-it": "transport",
}


def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the offline wiktextract enrichment script."""
    parser = argparse.ArgumentParser(
        description=(
            "Arricchisce il lessico refined leggendo offline il file raw-wiktextract-data.jsonl, "
            "senza effettuare richieste di rete."
        )
    )
    parser.add_argument(
        "--input",
        type=Path,
        default=REFINED_LEXICON_OUTPUT_PATH,
        help="Lessico refined di partenza.",
    )
    parser.add_argument(
        "--review",
        type=Path,
        default=REVIEW_INPUT_PATH,
        help="File to_be_review.json da usare per selezionare i lemmi prioritari.",
    )
    parser.add_argument(
        "--wiktextract",
        type=Path,
        default=WIKTEXTRACT_INPUT_PATH,
        help="File JSONL raw estratto da Wiktionary.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=WIKTEXTRACT_OUTPUT_PATH,
        help="Lessico refined con blocco wiktextract aggiunto.",
    )
    parser.add_argument(
        "--index-cache",
        type=Path,
        default=WIKTEXTRACT_INDEX_CACHE_PATH,
        help="Cache dell'indice lemmi->righe del JSONL per velocizzare i rilanci.",
    )
    parser.add_argument(
        "--word-limit",
        type=int,
        default=0,
        help="Limite massimo di parole da elaborare. 0 = tutte le candidate.",
    )
    parser.add_argument(
        "--words",
        default="",
        help="Lista separata da virgole di lemmi specifici da arricchire.",
    )
    parser.add_argument(
        "--review-reasons",
        default=",".join(sorted(DEFAULT_REVIEW_REASONS)),
        help="Motivi del file review da trattare con priorita, separati da virgole.",
    )
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="Salta le voci che nel lessico di input hanno gia un blocco wiktextract utile.",
    )
    return parser.parse_args()
def load_json(path: Path, default: object) -> object:
    """Return the JSON document stored at *path*, or *default* when the file does not exist."""
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return default


def write_json(path: Path, payload: object) -> None:
    """Write *payload* to *path* as indented UTF-8 JSON, keeping non-ASCII characters."""
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")


def parse_csv_set(value: str) -> set[str]:
    """Split a comma-separated string into a set of stripped, lower-cased, non-empty items."""
    tokens = (part.strip() for part in str(value or "").split(","))
    return {token.lower() for token in tokens if token}


def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Return the ``(lower-cased form, upper-cased POS)`` key that identifies an entry.

    ``normalized_form`` is preferred over ``form`` when present and truthy.
    """
    raw_form = entry.get("normalized_form") or entry.get("form") or ""
    raw_pos = entry.get("pos") or ""
    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
def load_or_build_index(jsonl_path: Path, index_cache_path: Path) -> Dict[str, List[int]]:
    """Return a ``word -> [byte offsets]`` index of the Italian records in *jsonl_path*.

    The index is cached in *index_cache_path* together with the source path,
    size and mtime, and is rebuilt whenever that metadata no longer matches.

    Raises:
        FileNotFoundError: when the index must be rebuilt and *jsonl_path* is missing.
    """
    cached = load_json(index_cache_path, {})

    if jsonl_path.exists():
        stat_result = jsonl_path.stat()  # one stat call instead of one per field
        size, mtime = stat_result.st_size, stat_result.st_mtime
    else:
        size, mtime = 0, 0
    expected_meta = {
        "source": str(jsonl_path.resolve()),
        "size": size,
        "mtime": mtime,
        # Marks the offsets as plain byte positions, so caches written with
        # text-mode tell() cookies are rebuilt instead of being trusted.
        "offsets": "bytes",
    }
    if (
        isinstance(cached, dict)
        and cached.get("meta") == expected_meta
        and isinstance(cached.get("index"), dict)
    ):
        return {str(key): list(value) for key, value in cached["index"].items()}

    index: Dict[str, List[int]] = {}
    position = 0
    # Binary mode: the offset is tracked by summing line lengths, which avoids
    # a text-mode tell() call on every line of a very large file.
    with jsonl_path.open("rb") as handle:
        for line in handle:
            line_start = position
            position += len(line)
            raw = line.strip()
            if not raw:
                continue
            obj = json.loads(raw)
            if not isinstance(obj, dict) or obj.get("lang_code") != "it":
                continue
            word = str(obj.get("word", "")).strip().lower()
            if word:
                index.setdefault(word, []).append(line_start)

    write_json(index_cache_path, {"meta": expected_meta, "index": index})
    return index


def read_jsonl_objects_at_offsets(jsonl_path: Path, offsets: Sequence[int]) -> List[Dict[str, object]]:
    """Decode the JSON line that starts at each byte offset, in the order given.

    Offsets at or beyond the end of the file are skipped.
    """
    objects: List[Dict[str, object]] = []
    with jsonl_path.open("rb") as handle:
        for offset in offsets:
            handle.seek(offset)
            line = handle.readline()
            if not line:
                continue
            objects.append(json.loads(line))
    return objects


def map_pos(value: str) -> str:
    """Map a wiktextract POS code to the lexicon tag; unknown codes are upper-cased."""
    normalized = str(value or "").strip().lower()
    return POS_MAP.get(normalized, normalized.upper() if normalized else "")


def normalize_text(text: str) -> str:
    """Strip *text* and collapse every whitespace run into a single space."""
    value = str(text or "").strip()
    value = re.sub(r"\s+", " ", value)
    return value


def sense_topics(sense: Dict[str, object], categories: Sequence[str]) -> List[str]:
    """Return the sorted lexicon topics implied by a sense's topics and by *categories*."""
    topics = set()
    for topic in sense.get("topics", []) or []:
        normalized = TOPIC_MAP.get(str(topic).strip().lower())
        if normalized:
            topics.add(normalized)
    for category in categories:
        normalized = CATEGORY_TOPIC_HINTS.get(str(category).strip().lower())
        if normalized:
            topics.add(normalized)
    return sorted(topics)
def word_level_topics(entries: Sequence[Dict[str, object]], categories: Sequence[str]) -> List[str]:
    """Return the sorted union of the topics of every sense of every entry."""
    collected = set()
    for entry in entries:
        for sense in entry.get("senses", []) or []:
            if not isinstance(sense, dict):
                continue
            collected.update(sense_topics(sense, categories))
    return sorted(collected)


def grammar_hints(entries: Sequence[Dict[str, object]]) -> List[str]:
    """Derive sorted grammar hints from entry tags and from keywords in the glosses.

    Hints about altered forms are only looked for on nouns; the mood and number
    hints are looked for on every entry.
    """
    noun_markers = ("diminutivo", "accrescitivo", "peggiorativo")
    general_markers = ("congiuntivo", "imperativo", "plurale")

    hints = set()
    for entry in entries:
        pos = str(entry.get("pos", "")).lower()
        tags = [str(tag).lower() for tag in entry.get("tags", []) or []]
        if pos == "verb" and "form-of" in tags:
            hints.add("voce_verbale")

        markers = general_markers + noun_markers if pos == "noun" else general_markers
        for sense in entry.get("senses", []) or []:
            if not isinstance(sense, dict):
                continue
            for gloss in sense.get("glosses", []) or []:
                gloss_text = str(gloss).lower()
                hints.update(marker for marker in markers if marker in gloss_text)
    return sorted(hints)


def simplify_entry(obj: Dict[str, object]) -> Dict[str, object]:
    """Reduce a raw wiktextract record to the fields used by the lexicon pipeline.

    Senses without any non-empty gloss are dropped.
    """
    categories = [normalize_text(item) for item in obj.get("categories", []) or [] if item]

    senses = []
    for sense in obj.get("senses", []) or []:
        if not isinstance(sense, dict):
            continue
        glosses = [text for text in map(normalize_text, sense.get("glosses", []) or []) if text]
        if not glosses:
            continue

        examples = []
        for example in sense.get("examples", []) or []:
            if not isinstance(example, dict):
                continue
            example_text = normalize_text(example.get("text", ""))
            if example_text:
                examples.append(example_text)

        senses.append(
            {
                "glosses": glosses,
                "examples": examples,
                "topics": sense_topics(sense, categories),
                "tags": [str(tag) for tag in sense.get("tags", []) or [] if tag],
                "categories": [normalize_text(item) for item in sense.get("categories", []) or [] if item],
            }
        )

    def linked_words(field: str) -> List[object]:
        # Keep only dict items that actually name a word.
        return [item for item in obj.get(field, []) or [] if isinstance(item, dict) and item.get("word")]

    return {
        "word": obj.get("word"),
        "lang": obj.get("lang"),
        "lang_code": obj.get("lang_code"),
        "pos": map_pos(str(obj.get("pos", ""))),
        "pos_title": obj.get("pos_title"),
        "tags": [str(tag) for tag in obj.get("tags", []) or [] if tag],
        "categories": categories,
        "senses": senses,
        "synonyms": linked_words("synonyms"),
        "related": linked_words("related"),
    }
def choose_best_entries(refined_entry: Dict[str, object], candidates: Sequence[Dict[str, object]]) -> List[Dict[str, object]]:
    """Return the candidates whose POS equals the refined entry's POS, or all of them if none match."""
    target_pos = str(refined_entry.get("pos", "")).upper()
    exact = [candidate for candidate in candidates if str(candidate.get("pos", "")).upper() == target_pos]
    if exact:
        return exact
    return list(candidates)


def wiktextract_already_useful(entry: Dict[str, object]) -> bool:
    """Return True when *entry* already carries a conclusive ``wiktextract`` block.

    Conclusive means either an enriched block with definitions or entries, or a
    block recording that nothing was found (``missing`` / ``no_match``).
    """
    payload = entry.get("wiktextract", {})
    if not isinstance(payload, dict):
        return False
    status = str(payload.get("status", "")).lower()
    if status == "enriched" and (payload.get("definitions") or payload.get("entries")):
        return True
    return status in {"missing", "no_match"}


def select_targets(
    refined_payload: Dict[str, object],
    review_payload: Dict[str, object],
    review_reasons: set[str],
    explicit_words: set[str],
    word_limit: int,
    skip_existing: bool,
) -> Tuple[List[Dict[str, object]], int]:
    """Choose the refined entries to enrich.

    When *explicit_words* is non-empty only those words are considered, in
    alphabetical order. Otherwise the review file is walked in order and an
    entry is taken when it has one of *review_reasons* or its BabelNet status
    is ``no_match``.

    Args:
        word_limit: Maximum number of entries to return; ``0`` means no limit.
        skip_existing: Skip entries for which ``wiktextract_already_useful`` is true.

    Returns:
        ``(selected_entries, skipped_existing_count)``.
    """
    refined_entries = [entry for entry in refined_payload.get("entries", []) or [] if isinstance(entry, dict)]
    # Keys are stripped as well as lower-cased, matching how lookups are normalised.
    refined_by_word = {
        str(entry.get("form", "")).strip().lower(): entry
        for entry in refined_entries
        if entry.get("form")
    }

    if explicit_words:
        selected = []
        skipped_existing_count = 0
        # Sorted: set iteration order varies between runs, which made the
        # word_limit cut pick different words each time.
        for word in sorted(explicit_words):
            entry = refined_by_word.get(word)
            if entry is None:
                continue
            if skip_existing and wiktextract_already_useful(entry):
                skipped_existing_count += 1
                continue
            selected.append(entry)
        selected = selected[:word_limit] if word_limit > 0 else selected
        return selected, skipped_existing_count

    review_entries = [entry for entry in review_payload.get("entries", []) or [] if isinstance(entry, dict)]
    selected_words: List[str] = []
    seen = set()
    skipped_existing_count = 0

    for review_entry in review_entries:
        word = str(review_entry.get("form", "")).strip().lower()
        if not word or word in seen:
            continue
        reasons = {str(item).lower() for item in review_entry.get("review_reasons", []) or []}
        refined = refined_by_word.get(word)
        if refined is None:
            continue
        if skip_existing and wiktextract_already_useful(refined):
            skipped_existing_count += 1
            continue
        babelnet_status = str((refined.get("babelnet") or {}).get("status", "")).lower()
        if reasons.intersection(review_reasons) or babelnet_status == "no_match":
            selected_words.append(word)
            seen.add(word)
        if word_limit > 0 and len(selected_words) >= word_limit:
            break

    return [refined_by_word[word] for word in selected_words if word in refined_by_word], skipped_existing_count
entry for entry in refined_entries if entry.get("form")} + + if explicit_words: + selected = [] + skipped_existing_count = 0 + for word in explicit_words: + entry = refined_by_word.get(word) + if entry is None: + continue + if skip_existing and wiktextract_already_useful(entry): + skipped_existing_count += 1 + continue + selected.append(entry) + selected = selected[:word_limit] if word_limit > 0 else selected + return selected, skipped_existing_count + + review_entries = [entry for entry in review_payload.get("entries", []) or [] if isinstance(entry, dict)] + selected_words: List[str] = [] + seen = set() + skipped_existing_count = 0 + + for review_entry in review_entries: + word = str(review_entry.get("form", "")).strip().lower() + if not word or word in seen: + continue + reasons = {str(item).lower() for item in review_entry.get("review_reasons", []) or []} + refined = refined_by_word.get(word) + if refined is None: + continue + if skip_existing and wiktextract_already_useful(refined): + skipped_existing_count += 1 + continue + babelnet_status = str((refined.get("babelnet") or {}).get("status", "")).lower() + if reasons.intersection(review_reasons) or babelnet_status == "no_match": + selected_words.append(word) + seen.add(word) + if word_limit > 0 and len(selected_words) >= word_limit: + break + + return [refined_by_word[word] for word in selected_words if word in refined_by_word], skipped_existing_count + + +def wiktextract_payload_for_entry(refined_entry: Dict[str, object], matches: Sequence[Dict[str, object]]) -> Dict[str, object]: + if not matches: + return { + "status": "missing", + "matched": False, + "definitions": [], + "entries": [], + "topic_hints": [], + "grammar_hints": [], + } + + selected_entries = choose_best_entries(refined_entry, matches) + definitions = [] + for item in selected_entries: + for sense in item.get("senses", []) or []: + if not isinstance(sense, dict): + continue + definitions.extend(sense.get("glosses", []) or []) + definitions = 
[normalize_text(item) for item in definitions if normalize_text(item)] + + all_categories = [] + for item in selected_entries: + all_categories.extend(item.get("categories", []) or []) + + return { + "status": "enriched" if definitions else "entries_without_definitions", + "matched": bool(definitions), + "definitions": definitions, + "entries": selected_entries, + "topic_hints": word_level_topics(selected_entries, all_categories), + "grammar_hints": grammar_hints(selected_entries), + "categories": sorted(set(normalize_text(item) for item in all_categories if normalize_text(item))), + } + + +def enrich_from_wiktextract(args: argparse.Namespace) -> Dict[str, object]: + refined_payload = load_json(args.input, {"entries": []}) + if not isinstance(refined_payload, dict) or "entries" not in refined_payload: + raise ValueError(f"Lessico refined non valido: {args.input}") + + review_payload = load_json(args.review, {"entries": []}) + if not isinstance(review_payload, dict): + review_payload = {"entries": []} + + targets, skipped_existing_count = select_targets( + refined_payload, + review_payload, + parse_csv_set(args.review_reasons), + parse_csv_set(args.words), + args.word_limit, + args.skip_existing, + ) + + print( + f"Target selezionati: {len(targets)}" + + (f" | già saltati per wiktextract esistente: {skipped_existing_count}" if args.skip_existing else "") + ) + + index = load_or_build_index(args.wiktextract, args.index_cache) + refined_index = { + entry_key(entry): deepcopy(entry) + for entry in refined_payload.get("entries", []) or [] + if isinstance(entry, dict) + } + + matched_count = 0 + missing_count = 0 + + for idx, entry in enumerate(targets, start=1): + updated = deepcopy(entry) + word = str(entry.get("form", "")).strip().lower() + offsets = index.get(word, []) + objects = [simplify_entry(obj) for obj in read_jsonl_objects_at_offsets(args.wiktextract, offsets)] + payload = wiktextract_payload_for_entry(updated, objects) + updated["wiktextract"] = payload + 
updated["wiktextract_generated_at"] = datetime.now().astimezone().isoformat(timespec="seconds") + refined_index[entry_key(updated)] = updated + + if payload.get("matched"): + matched_count += 1 + else: + missing_count += 1 + + print( + f"[{idx}/{len(targets)}] {word}: " + f"status={payload.get('status')} " + f"def={len(payload.get('definitions', []))} " + f"topics={len(payload.get('topic_hints', []))} " + f"entries={len(payload.get('entries', []))}" + ) + + merged_entries = list(refined_index.values()) + merged_entries.sort(key=lambda item: (str(item.get("normalized_form", "")), str(item.get("pos", "")))) + + merged_payload = { + "meta": { + **(refined_payload.get("meta", {}) if isinstance(refined_payload.get("meta"), dict) else {}), + "wiktextract_source": str(args.wiktextract), + "wiktextract_generated_at": datetime.now().astimezone().isoformat(timespec="seconds"), + "wiktextract_target_count": len(targets), + "wiktextract_skipped_existing_count": skipped_existing_count, + "wiktextract_matched_count": matched_count, + "wiktextract_missing_count": missing_count, + }, + "entries": merged_entries, + } + + write_json(args.output, merged_payload) + + return { + "target_count": len(targets), + "skipped_existing_count": skipped_existing_count, + "matched_count": matched_count, + "missing_count": missing_count, + "output": str(args.output), + } + + +def main() -> None: + args = parse_args() + result = enrich_from_wiktextract(args) + print(f"Lessico con Wiktextract generato: {result['output']}") + print(f"Voci trattate: {result['target_count']}") + print(f"Voci già saltate: {result['skipped_existing_count']}") + print(f"Match Wiktextract: {result['matched_count']}") + print(f"Senza match Wiktextract: {result['missing_count']}") + + +if __name__ == "__main__": + main() diff --git a/enrich_review_from_wiktionary.py b/enrich_review_from_wiktionary.py new file mode 100644 index 0000000..dc9075f --- /dev/null +++ b/enrich_review_from_wiktionary.py @@ -0,0 +1,678 @@ +from 
__future__ import annotations + +import argparse +import json +import re +import time +import urllib.parse +import urllib.request +import urllib.error +from copy import deepcopy +from datetime import datetime +from pathlib import Path +from typing import Dict, Iterable, List, Optional, Sequence, Tuple + +from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH + + +REVIEW_INPUT_PATH = Path(__file__).with_name("to_be_review.json") +WIKTIONARY_CACHE_PATH = Path(__file__).with_name(".wiktionary_cache.json") +WIKTIONARY_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_refined_plus_wiktionary.json") +WIKTIONARY_API_URL = "https://it.wiktionary.org/w/api.php" + +DEFAULT_REVIEW_REASONS = {"no_viable_definition", "only_general_topics", "babelnet_ambiguous"} + +POS_ALIASES = { + "sostantivo": "NOUN", + "nome": "NOUN", + "sost": "NOUN", + "aggettivo": "ADJ", + "agg": "ADJ", + "verbo": "VERB", + "verb": "VERB", + "verb form": "VERB_FORM", + "avverbio": "ADV", + "avv": "ADV", + "preposizione": "PREP", + "prep": "PREP", + "congiunzione": "CONJ", + "cong": "CONJ", + "pronome": "PRON", + "pron": "PRON", + "articolo": "ART", + "interiezione": "INTJ", + "inter": "INTJ", + "locuzione": "PHRASE", + "loc": "PHRASE", +} + +TOPIC_KEYWORDS = { + "religion": ("religione", "cattolic", "sacro", "sacra", "devozion", "scapolare", "abbazia", "monastero"), + "clothing": ("abito", "vestito", "vestit", "abbigliamento", "indumento", "stoffa"), + "grammar": ("diminutivo", "voce verbale", "congiuntivo", "plurale", "singolare", "grammatica", "verbo"), + "geography": ("comune", "paese", "regione", "provincia", "citta", "localita", "frazione"), + "transport": ("veicolo", "motore", "treno", "aereo", "trasporto", "nave", "imbarcazione"), + "health": ("medicina", "ospedale", "malattia", "cura", "feriti", "ammalati", "sanitario"), +} + +GRAMMAR_KEYWORDS = ( + "diminutivo", + "accrescitivo", + "peggiorativo", + "alterato", + "voce verbale", + "congiuntivo", + "participio", + "plurale", + "singolare", 
+ "maschile", + "femminile", +) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Arricchisce le voci problematiche del lessico refined con definizioni e metadati " + "estratti da it.wiktionary.org." + ) + ) + parser.add_argument( + "--input", + type=Path, + default=REFINED_LEXICON_OUTPUT_PATH, + help="Lessico refined di partenza.", + ) + parser.add_argument( + "--review", + type=Path, + default=REVIEW_INPUT_PATH, + help="File to_be_review.json da usare per selezionare le voci prioritarie.", + ) + parser.add_argument( + "--output", + type=Path, + default=WIKTIONARY_OUTPUT_PATH, + help="Nuovo lessico con blocco wiktionary aggiunto.", + ) + parser.add_argument( + "--cache", + type=Path, + default=WIKTIONARY_CACHE_PATH, + help="Cache locale delle risposte Wiktionary.", + ) + parser.add_argument( + "--word-limit", + type=int, + default=0, + help="Limite massimo di parole da elaborare. 0 = tutte le candidate.", + ) + parser.add_argument( + "--sleep", + type=float, + default=1.0, + help="Pausa tra le richieste HTTP a Wiktionary.", + ) + parser.add_argument( + "--save-every", + type=int, + default=25, + help="Salva cache e output ogni N parole elaborate per non perdere progresso.", + ) + parser.add_argument( + "--retry-429", + type=int, + default=3, + help="Numero massimo di tentativi aggiuntivi se Wiktionary risponde HTTP 429.", + ) + parser.add_argument( + "--backoff-429", + type=float, + default=30.0, + help="Secondi di attesa iniziali dopo un HTTP 429; raddoppiano a ogni nuovo tentativo.", + ) + parser.add_argument( + "--stop-on-429", + action="store_true", + help="Se attivo, al primo HTTP 429 salva lo stato e interrompe il batch senza altri tentativi.", + ) + parser.add_argument( + "--words", + default="", + help="Lista separata da virgole di lemmi specifici da arricchire.", + ) + parser.add_argument( + "--review-reasons", + default=",".join(sorted(DEFAULT_REVIEW_REASONS)), + help="Motivi del file review da trattare 
con priorita, separati da virgole.", + ) + parser.add_argument( + "--api-url", + default=WIKTIONARY_API_URL, + help="Endpoint MediaWiki Action API di Wiktionary.", + ) + parser.add_argument( + "--skip-existing", + action="store_true", + help="Salta le voci che nel lessico di input hanno già un blocco wiktionary con stato utile.", + ) + return parser.parse_args() + + +def load_json(path: Path, default: object) -> object: + if not path.exists(): + return default + return json.loads(path.read_text(encoding="utf-8")) + + +def write_json(path: Path, payload: object) -> None: + path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + + +def parse_csv_set(value: str) -> set[str]: + return {item.strip().lower() for item in str(value or "").split(",") if item.strip()} + + +def entry_key(entry: Dict[str, object]) -> Tuple[str, str]: + form = str(entry.get("normalized_form") or entry.get("form") or "").strip().lower() + pos = str(entry.get("pos") or "").strip().upper() + return form, pos + + +def fetch_wikitext(title: str, api_url: str) -> Dict[str, object]: + params = { + "action": "query", + "prop": "revisions", + "titles": title, + "rvprop": "content", + "rvslots": "main", + "formatversion": "2", + "format": "json", + } + url = f"{api_url}?{urllib.parse.urlencode(params)}" + request = urllib.request.Request( + url, + headers={ + "User-Agent": "cruciverba-alpha/0.1 (local lexical enrichment)", + "Accept": "application/json", + }, + ) + with urllib.request.urlopen(request, timeout=30) as response: + payload = json.loads(response.read().decode("utf-8")) + pages = ((payload.get("query") or {}).get("pages") or []) + if not pages: + return {"status": "missing"} + page = pages[0] + if page.get("missing"): + return {"status": "missing", "title": page.get("title", title)} + revisions = page.get("revisions") or [] + content = "" + if revisions: + slots = revisions[0].get("slots") or {} + main_slot = slots.get("main") or {} + content = 
str(main_slot.get("content") or "") + return { + "status": "ok" if content else "empty", + "title": page.get("title", title), + "pageid": page.get("pageid"), + "wikitext": content, + } + + +def fetch_wikitext_with_retry(title: str, args: argparse.Namespace) -> Dict[str, object]: + attempts = 0 + delay = max(1.0, float(args.backoff_429)) + while True: + try: + return fetch_wikitext(title, args.api_url) + except urllib.error.HTTPError as exc: + if exc.code != 429: + raise + if args.stop_on_429: + raise + if attempts >= max(0, int(args.retry_429)): + raise + attempts += 1 + print(f"[429] {title}: attendo {delay:.1f}s prima del tentativo {attempts}/{args.retry_429}") + time.sleep(delay) + delay *= 2 + + +def normalize_heading(text: str) -> str: + raw = str(text or "").strip().lower().replace(" ", "") + if raw == "{{-it-}}": + return "{{-it-}}" + cleaned = strip_wikicode(text).strip().lower() + return cleaned + + +def extract_italian_section(wikitext: str) -> str: + section_pattern = re.compile(r"^==\s*(.*?)\s*==\s*$", re.MULTILINE) + matches = list(section_pattern.finditer(wikitext)) + for index, match in enumerate(matches): + raw_heading = str(match.group(1) or "").strip().lower().replace(" ", "") + heading = normalize_heading(match.group(1)) + if raw_heading == "{{-it-}}" or heading in {"italiano", "it"}: + start = match.end() + end = matches[index + 1].start() if index + 1 < len(matches) else len(wikitext) + return wikitext[start:end] + return "" + + +def strip_templates(text: str) -> str: + previous = None + current = text + while previous != current: + previous = current + current = re.sub(r"\{\{([^{}|]+)\|([^{}]+?)\}\}", r"\2", current) + current = re.sub(r"\{\{[^{}]+\}\}", "", current) + return current + + +def strip_wikicode(text: str) -> str: + value = str(text or "") + value = re.sub(r"", " ", value, flags=re.DOTALL) + value = re.sub(r"]*>.*?", " ", value, flags=re.DOTALL) + value = re.sub(r"<[^>]+>", " ", value) + value = strip_templates(value) + value = 
re.sub(r"\[\[([^|\]]+)\|([^\]]+)\]\]", r"\2", value) + value = re.sub(r"\[\[([^\]]+)\]\]", r"\1", value) + value = value.replace("'''", "").replace("''", "") + value = value.replace(" ", " ") + value = re.sub(r"\s+", " ", value) + return value.strip(" .;:-") + + +def infer_topics(definitions: Sequence[str], categories: Sequence[str]) -> List[str]: + text = " ".join(definitions + list(categories)).lower() + topics = [] + for topic, keywords in TOPIC_KEYWORDS.items(): + if any(keyword in text for keyword in keywords): + topics.append(topic) + return sorted(set(topics)) + + +def infer_grammar_hints(definitions: Sequence[str], raw_section: str) -> List[str]: + text = f"{' '.join(definitions)} {raw_section}".lower() + hints = [] + for keyword in GRAMMAR_KEYWORDS: + if keyword in text: + hints.append(keyword) + return sorted(set(hints)) + + +def detect_pos_from_heading(heading: str) -> Optional[str]: + normalized = normalize_heading(heading) + if not normalized: + return None + for label, pos in sorted(POS_ALIASES.items(), key=lambda item: len(item[0]), reverse=True): + if label in normalized: + return pos + return None + + +def parse_template_marker(line: str) -> Tuple[Optional[str], Optional[str]]: + stripped = line.strip() + match = re.match(r"^\{\{-([^{}|]+?)-?(?:\|.*)?\}\}$", stripped, flags=re.IGNORECASE) + if not match: + return None, None + marker = match.group(1).strip().lower() + if marker == "it": + return "language", "it" + for label, pos in sorted(POS_ALIASES.items(), key=lambda item: len(item[0]), reverse=True): + if marker.startswith(label): + return "pos", pos + if marker.startswith("sinon"): + return "subsection", "sinonimi" + if marker.startswith(("etim", "trad", "sill", "pron", "var", "note")): + return "subsection", marker + return "subsection", marker + + +def parse_wiktionary_section(section_text: str) -> Dict[str, object]: + lines = section_text.splitlines() + entries: List[Dict[str, object]] = [] + categories: List[str] = [] + current: 
Optional[Dict[str, object]] = None + current_subsection = "" + + heading_pattern = re.compile(r"^(={3,4})\s*(.*?)\s*\1\s*$") + + for raw_line in lines: + line = raw_line.rstrip() + if not line: + continue + + for category_match in re.findall(r"\[\[Categoria:([^\]]+)\]\]", line): + categories.append(strip_wikicode(category_match)) + + marker_kind, marker_value = parse_template_marker(line) + if marker_kind == "pos": + current = { + "pos": marker_value, + "heading": marker_value, + "definitions": [], + "examples": [], + "synonyms": [], + } + entries.append(current) + current_subsection = "" + continue + if marker_kind == "subsection": + current_subsection = str(marker_value or "") + continue + + heading_match = heading_pattern.match(line) + if heading_match: + level = len(heading_match.group(1)) + heading = heading_match.group(2) + if level == 3: + pos = detect_pos_from_heading(heading) + if pos: + current = { + "pos": pos, + "heading": strip_wikicode(heading), + "definitions": [], + "examples": [], + "synonyms": [], + } + entries.append(current) + current_subsection = "" + continue + current_subsection = normalize_heading(heading) + continue + + if current is None: + continue + + stripped = line.lstrip() + if stripped.startswith("#") and not stripped.startswith(("#:", "#*", "#;")): + definition = strip_wikicode(stripped.lstrip("#").strip()) + if definition: + current["definitions"].append(definition) + continue + + if stripped.startswith("#:") or stripped.startswith("#*"): + example = strip_wikicode(stripped[2:].strip()) + if example: + current["examples"].append(example) + continue + + if current_subsection.startswith("sinonim") and stripped.startswith("*"): + synonym = strip_wikicode(stripped.lstrip("*").strip()) + if synonym: + current["synonyms"].append(synonym) + + flat_definitions = [definition for entry in entries for definition in entry["definitions"]] + topic_hints = infer_topics(flat_definitions, categories) + grammar_hints = 
infer_grammar_hints(flat_definitions, section_text) + + return { + "entries": entries, + "categories": sorted(set(filter(None, categories))), + "definitions": flat_definitions, + "topic_hints": topic_hints, + "grammar_hints": grammar_hints, + } + + +def wiktionary_payload_for_entry(entry: Dict[str, object], api_response: Dict[str, object]) -> Dict[str, object]: + status = str(api_response.get("status", "missing")) + if status != "ok": + return { + "status": status, + "matched": False, + "page_title": api_response.get("title") or entry.get("form"), + "source_url": f"https://it.wiktionary.org/wiki/{urllib.parse.quote(str(entry.get('form', '')))}", + "definitions": [], + "entries": [], + "topic_hints": [], + "grammar_hints": [], + "categories": [], + } + + italian_section = extract_italian_section(str(api_response.get("wikitext") or "")) + if not italian_section: + return { + "status": "no_italian_section", + "matched": False, + "page_title": api_response.get("title") or entry.get("form"), + "source_url": f"https://it.wiktionary.org/wiki/{urllib.parse.quote(str(api_response.get('title') or entry.get('form', '')))}", + "definitions": [], + "entries": [], + "topic_hints": [], + "grammar_hints": [], + "categories": [], + } + + parsed = parse_wiktionary_section(italian_section) + matched = bool(parsed["definitions"]) + + return { + "status": "enriched" if matched else "section_without_definitions", + "matched": matched, + "page_title": api_response.get("title") or entry.get("form"), + "pageid": api_response.get("pageid"), + "source_url": f"https://it.wiktionary.org/wiki/{urllib.parse.quote(str(api_response.get('title') or entry.get('form', '')))}", + "definitions": parsed["definitions"], + "entries": parsed["entries"], + "topic_hints": parsed["topic_hints"], + "grammar_hints": parsed["grammar_hints"], + "categories": parsed["categories"], + "raw_excerpt": italian_section[:4000], + } + + +def select_targets( + refined_payload: Dict[str, object], + review_payload: Dict[str, 
object], + review_reasons: set[str], + explicit_words: set[str], + word_limit: int, + skip_existing: bool, +) -> Tuple[List[Dict[str, object]], int]: + refined_entries = [entry for entry in refined_payload.get("entries", []) or [] if isinstance(entry, dict)] + refined_by_word = {str(entry.get("form", "")).lower(): entry for entry in refined_entries if entry.get("form")} + + if explicit_words: + selected = [] + skipped_existing_count = 0 + for word in explicit_words: + entry = refined_by_word.get(word) + if entry is None: + continue + if skip_existing and wiktionary_already_useful(entry): + skipped_existing_count += 1 + continue + selected.append(entry) + selected = selected[:word_limit] if word_limit > 0 else selected + return selected, skipped_existing_count + + review_entries = [entry for entry in review_payload.get("entries", []) or [] if isinstance(entry, dict)] + selected_words: List[str] = [] + seen = set() + skipped_existing_count = 0 + + for review_entry in review_entries: + word = str(review_entry.get("form", "")).strip().lower() + if not word or word in seen: + continue + reasons = {str(item).lower() for item in review_entry.get("review_reasons", []) or []} + refined = refined_by_word.get(word) + if refined is None: + continue + if skip_existing and wiktionary_already_useful(refined): + skipped_existing_count += 1 + continue + babelnet_status = str((refined.get("babelnet") or {}).get("status", "")).lower() + if reasons.intersection(review_reasons) or babelnet_status == "no_match": + selected_words.append(word) + seen.add(word) + if word_limit > 0 and len(selected_words) >= word_limit: + break + + return [refined_by_word[word] for word in selected_words if word in refined_by_word], skipped_existing_count + + +def wiktionary_already_useful(entry: Dict[str, object]) -> bool: + wiktionary = entry.get("wiktionary", {}) + if not isinstance(wiktionary, dict): + return False + status = str(wiktionary.get("status", "")).lower() + if status == "enriched" and 
(wiktionary.get("definitions") or wiktionary.get("entries")): + return True + if status in {"missing", "no_italian_section", "section_without_definitions", "empty"}: + return True + return False + + +def enrich_from_wiktionary(args: argparse.Namespace) -> Dict[str, object]: + refined_payload = load_json(args.input, {"entries": []}) + if not isinstance(refined_payload, dict) or "entries" not in refined_payload: + raise ValueError(f"Lessico refined non valido: {args.input}") + + review_payload = load_json(args.review, {"entries": []}) + if not isinstance(review_payload, dict): + review_payload = {"entries": []} + + cache = load_json(args.cache, {}) + if not isinstance(cache, dict): + cache = {} + + targets, skipped_existing_count = select_targets( + refined_payload, + review_payload, + parse_csv_set(args.review_reasons), + parse_csv_set(args.words), + args.word_limit, + args.skip_existing, + ) + + enriched_entries = [] + cache_hits = 0 + network_calls = 0 + network_attempts = 0 + processed_count = 0 + stopped_reason = None + stop_word = None + + print( + f"Target selezionati: {len(targets)}" + + (f" | già saltati per wiktionary esistente: {skipped_existing_count}" if args.skip_existing else "") + ) + + def persist_progress() -> None: + refined_index = { + entry_key(entry): entry + for entry in refined_payload.get("entries", []) or [] + if isinstance(entry, dict) + } + for item in enriched_entries: + refined_index[entry_key(item)] = item + + merged_entries = list(refined_index.values()) + merged_entries.sort(key=lambda item: (str(item.get("normalized_form", "")), str(item.get("pos", "")))) + + merged_payload = { + "meta": { + **(refined_payload.get("meta", {}) if isinstance(refined_payload.get("meta"), dict) else {}), + "wiktionary_source": args.api_url, + "wiktionary_generated_at": datetime.now().astimezone().isoformat(timespec="seconds"), + "wiktionary_target_count": len(targets), + "wiktionary_processed_count": processed_count, + 
"wiktionary_skipped_existing_count": skipped_existing_count, + "wiktionary_cache_hits": cache_hits, + "wiktionary_network_calls": network_calls, + "wiktionary_network_attempts": network_attempts, + "wiktionary_stopped_reason": stopped_reason, + "wiktionary_stop_word": stop_word, + }, + "entries": merged_entries, + } + + write_json(args.cache, cache) + write_json(args.output, merged_payload) + + for index, entry in enumerate(targets, start=1): + updated = deepcopy(entry) + word = str(entry.get("form", "")).strip() + cache_key = word.lower() + + if cache_key in cache: + api_response = cache[cache_key] + cache_hits += 1 + else: + try: + network_attempts += 1 + api_response = fetch_wikitext_with_retry(word, args) + except urllib.error.HTTPError as exc: + if exc.code == 429: + stop_word = word + stopped_reason = f"http_429_after_{processed_count}_words" + print(f"[STOP] Wiktionary ha risposto 429 su '{word}'. Salvo il progresso e interrompo il batch.") + persist_progress() + return { + "target_count": len(targets), + "processed_count": processed_count, + "skipped_existing_count": skipped_existing_count, + "cache_hits": cache_hits, + "network_calls": network_calls, + "network_attempts": network_attempts, + "output": str(args.output), + "stopped_reason": stopped_reason, + "stop_word": stop_word, + } + raise + cache[cache_key] = api_response + network_calls += 1 + if args.sleep > 0: + time.sleep(args.sleep) + + updated["wiktionary"] = wiktionary_payload_for_entry(updated, api_response) + updated["wiktionary_generated_at"] = datetime.now().astimezone().isoformat(timespec="seconds") + enriched_entries.append(updated) + processed_count += 1 + print( + f"[{index}/{len(targets)}] {word}: " + f"status={updated['wiktionary'].get('status')} " + f"def={len(updated['wiktionary'].get('definitions', []))} " + f"topics={len(updated['wiktionary'].get('topic_hints', []))}" + ) + if args.save_every > 0 and processed_count % int(args.save_every) == 0: + persist_progress() + print(f"[save] 
progresso salvato dopo {processed_count} parole") + + persist_progress() + + return { + "target_count": len(targets), + "processed_count": processed_count, + "skipped_existing_count": skipped_existing_count, + "cache_hits": cache_hits, + "network_calls": network_calls, + "network_attempts": network_attempts, + "output": str(args.output), + "stopped_reason": stopped_reason, + "stop_word": stop_word, + } + + +def main() -> None: + args = parse_args() + result = enrich_from_wiktionary(args) + print(f"Lessico con Wiktionary generato: {result['output']}") + print(f"Voci trattate: {result.get('processed_count', result['target_count'])}/{result['target_count']}") + if "skipped_existing_count" in result: + print(f"Voci già saltate: {result['skipped_existing_count']}") + print(f"Cache hit: {result['cache_hits']}") + print(f"Chiamate rete: {result['network_calls']}") + if "network_attempts" in result: + print(f"Tentativi di rete: {result['network_attempts']}") + if result.get("stopped_reason"): + print(f"Batch interrotto: {result['stopped_reason']}") + if result.get("stop_word"): + print(f"Ultima parola bloccante: {result['stop_word']}") + + +if __name__ == "__main__": + main() diff --git a/main.py b/main.py index 9768bf9..68426d8 100644 --- a/main.py +++ b/main.py @@ -2,9 +2,14 @@ from __future__ import annotations import argparse import json +import os +import random from pathlib import Path +from types import SimpleNamespace from typing import Dict, List +from build_babelnet_enrichment import BABELNET_ENV_KEY, BABELNET_OUTPUT_PATH, BABELNET_LOCAL_KEY_PATH, load_babelnet_api_key +from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH from build_vocabulary import ( FILTERED_OUTPUT_PATH, METADATA_OUTPUT_PATH, @@ -13,6 +18,7 @@ from build_vocabulary import ( ) from build_lexicon import LEXICON_OUTPUT_PATH, build_lexicon from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH, build_semantic_lexicon +from clue_generator import generate_clues, 
load_enriched_entries from crossword_filler import CrosswordFiller, load_vocabulary, load_vocabulary_metadata from crossword_generator import CrosswordGenerator, WORDS, render_grid @@ -26,6 +32,14 @@ DIFFICULTY_ALIASES: Dict[str, int] = { DEFAULT_TOPIC = "general" DEFAULT_INITIAL_WORD_COUNT = len(WORDS) +DEFAULT_RUNTIME_LEXICON_CANDIDATES = ( + "lexicon_it_curated_llm_aggressive.json", + "lexicon_it_curated_llm.json", + "lexicon_it_curated.json", + "lexicon_it_refined_plus_wiktextract.json", + ENRICHED_LEXICON_OUTPUT_PATH.name, + SEMANTIC_LEXICON_OUTPUT_PATH.name, +) ABSTRACTISH_SUFFIXES = ("zione", "zioni", "mento", "menti", "ita", "ezza", "anza", "enza", "ismo") FILL_ALLOWED_POS = {"NOUN", "VERB", "ADJ", "ADV", "PREP", "CONJ"} GENERAL_FILL_MIN_QUALITY = 6 @@ -92,6 +106,8 @@ TOPIC_SEED_BLOCKED_SUBSTRINGS: Dict[str, tuple[str, ...]] = { ), } +ACTIVE_LEXICON_PATH: Path | None = None + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Generatore e filler di cruciverba.") @@ -115,6 +131,23 @@ def parse_args() -> argparse.Namespace: action="store_true", help="Rigenera `lexicon_it_semantic.json` arricchendo il lessico con IWN-OMW/ItalWordNet.", ) + parser.add_argument( + "--babelnet-enrich", + action="store_true", + help="Prima di generare il cruciverba arricchisce incrementalmente il lessico con BabelNet.", + ) + parser.add_argument( + "--babelnet-limit", + type=int, + default=20, + help="Numero massimo di parole da interrogare su BabelNet in questa esecuzione.", + ) + parser.add_argument( + "--babelnet-sleep", + type=float, + default=0.2, + help="Pausa in secondi tra richieste BabelNet consecutive.", + ) parser.add_argument( "--vocabulary", type=Path, @@ -159,7 +192,13 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--topic", default=DEFAULT_TOPIC, - help="Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.", + help="Tema del cruciverba. 
Puoi indicare un topic o una lista separata da virgole, es. transport,nature,ecology. Se lasci general, i topic possono essere scelti dal lessico con --max-topics.", + ) + parser.add_argument( + "--max-topics", + type=int, + default=1, + help="Numero massimo di topic casuali da scegliere dal lessico arricchito quando --topic e' general. Massimo consigliato: 3.", ) parser.add_argument( "--initial-word-count", @@ -173,6 +212,26 @@ def parse_args() -> argparse.Namespace: default=DEFAULT_THEMED_FILL_WORD_COUNT, help="Numero massimo indicativo di parole aggiunte dal filler da mantenere fortemente legate al tema.", ) + parser.add_argument( + "--definitions", + action="store_true", + help="Genera e stampa le definizioni per le parole inserite nel cruciverba.", + ) + parser.add_argument( + "--lexicon", + type=Path, + default=None, + help=( + "File lessicale da usare durante l'esecuzione. Se omesso, il programma usa il lessico " + "piu avanzato disponibile, preferendo lexicon_it_curated_llm_aggressive.json." 
def _print_babelnet_summary(title: str, result: Dict[str, object]) -> None:
    """Print the counters and per-word log returned by an enrichment run.

    Shared by the topic-driven and the definition-driven enrichment paths so
    both report exactly the same fields.
    """
    print(title)
    print(f"- parole interrogate: {result['selected_count']}")
    print(f"- chiamate API reali: {result['api_call_count']}")
    print(f"- risposte da cache: {result['cache_hit_count']}")
    print(f"- match: {result['matched_count']}")
    for item in result["word_logs"]:
        print(
            f"  {item['word']}: api_calls={item['api_calls']}, "
            f"cache_hits={item['cache_hits']}, risposta={item['responses'] > 0}, "
            f"match={item['matched']}, synsets={item['synsets']}"
        )


def ensure_babelnet_enrichment(args: argparse.Namespace) -> None:
    """Run the incremental BabelNet enrichment for the active topic, if enabled.

    Does nothing unless ``args.babelnet_enrich`` is set and
    ``args.babelnet_limit`` is positive. The enricher is imported lazily so
    the module is only loaded when enrichment is actually requested.
    """
    if not args.babelnet_enrich:
        return
    if args.babelnet_limit <= 0:
        print("BabelNet enrichment saltato: --babelnet-limit <= 0")
        return

    from babelnet_incremental_enricher import run_incremental_enrichment

    guide_topic = primary_topic(args.topic)
    namespace = SimpleNamespace(
        api_key=load_babelnet_api_key(),
        topic=guide_topic,
        difficulty=args.difficulty,
        limit=args.babelnet_limit,
        sleep=args.babelnet_sleep,
        semantic=SEMANTIC_LEXICON_OUTPUT_PATH,
        babelnet=BABELNET_OUTPUT_PATH,
        enriched=ENRICHED_LEXICON_OUTPUT_PATH,
        dry_run=False,
        retry_no_match=False,
    )

    print("Arricchimento BabelNet incrementale")
    print(f"- tema guida: {guide_topic}")
    print(f"- topic attivi: {args.topic}")
    print(f"- limite parole: {args.babelnet_limit}")
    print(f"- chiave: {BABELNET_ENV_KEY} oppure {BABELNET_LOCAL_KEY_PATH.name}")
    result = run_incremental_enrichment(namespace)
    _print_babelnet_summary("Riepilogo BabelNet", result)
    print()


def enrich_words_for_definitions(args: argparse.Namespace, words: List[str]) -> None:
    """Enrich the words placed in the grid so that clues can be generated.

    Does nothing unless ``args.definitions`` is set and
    ``args.definition_babelnet_limit`` is positive.

    Args:
        args: Parsed command-line options.
        words: Words currently placed in the crossword.
    """
    if not args.definitions:
        return
    if args.definition_babelnet_limit <= 0:
        print("Arricchimento BabelNet per definizioni saltato: --definition-babelnet-limit <= 0")
        return
    if not words:
        # NOTE(review): the enricher's handling of an empty ``words`` list is
        # not visible here; presumably a falsy list could fall back to
        # topic-wide selection and spend API quota for nothing. Skip instead.
        print("Arricchimento BabelNet per definizioni saltato: nessuna parola nel cruciverba")
        return

    from babelnet_incremental_enricher import run_incremental_enrichment

    namespace = SimpleNamespace(
        api_key=load_babelnet_api_key(),
        topic=primary_topic(args.topic),
        difficulty=args.difficulty,
        limit=args.definition_babelnet_limit,
        sleep=args.babelnet_sleep,
        semantic=SEMANTIC_LEXICON_OUTPUT_PATH,
        babelnet=BABELNET_OUTPUT_PATH,
        enriched=ENRICHED_LEXICON_OUTPUT_PATH,
        dry_run=False,
        retry_no_match=False,
        words=words,
    )

    print()
    print("Arricchimento BabelNet per definizioni")
    print(f"- parole nel cruciverba: {len(set(words))}")
    print(f"- limite parole: {args.definition_babelnet_limit}")
    result = run_incremental_enrichment(namespace)
    _print_babelnet_summary("Riepilogo BabelNet definizioni", result)
def word_is_on_topic(entry: Dict[str, object], topic: str) -> bool:
    """Tell whether a lexicon entry belongs to the requested topic(s).

    ``topic`` may be a single name or a comma-separated list; with several
    topics the entry is on topic as soon as it matches one of them. The
    default topic accepts every entry. Evidence is checked in this order:
    the entry's own ``topics``, its ``semantic.semantic_topics``, the BabelNet
    best synset (same topic and ``topic_score`` of at least 40), and finally
    ``strong_topic_relevance``.
    """
    requested = parse_topics(topic)
    if len(requested) > 1:
        for single_topic in requested:
            if word_is_on_topic(entry, single_topic):
                return True
        return False

    wanted = requested[0]
    if wanted == DEFAULT_TOPIC:
        return True

    declared = {str(item).lower() for item in entry.get("topics", []) if item}
    if wanted in declared:
        return True

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        inferred = {str(item).lower() for item in semantic.get("semantic_topics", []) if item}
        if wanted in inferred:
            return True

    babelnet = entry.get("babelnet", {})
    best = babelnet.get("best_synset", {}) if isinstance(babelnet, dict) else None
    if isinstance(best, dict):
        try:
            score = int(best.get("topic_score", 0))
        except (TypeError, ValueError):
            # A missing or malformed score counts as no evidence.
            score = 0
        if best.get("topic") == wanted and score >= 40:
            return True

    # Last resort: the scoring heuristic. Any failure there is treated as
    # "not on topic" because this function only feeds diagnostics output.
    try:
        return strong_topic_relevance(entry, wanted) > 0
    except Exception:
        return False
total_cells else 0.0 + target_empty_cells = round(total_cells * args.target_empty_ratio) + target_delta = empty_cells - target_empty_cells + topic_words = [] + off_topic_words = [] + topic_distribution = {topic: 0 for topic in active_topics if topic != DEFAULT_TOPIC} + pos_counts = { + "sostantivi": 0, + "aggettivi": 0, + "verbi": 0, + "avverbi": 0, + "preposizioni": 0, + "congiunzioni": 0, + "altri": 0, + } + + for word in unique_words: + entry = entries.get(word, {}) + label = pos_label(str(entry.get("pos", ""))) + pos_counts[label] = pos_counts.get(label, 0) + 1 + if entry and word_is_on_topic(entry, args.topic): + topic_words.append(word) + for selected_topic in topic_distribution: + if word_is_on_topic(entry, selected_topic): + topic_distribution[selected_topic] += 1 + else: + off_topic_words.append(word) + + print() + print("Diagnostica alpha:") + print(f"- parole uniche nello schema: {len(unique_words)}") + print(f"- celle totali: {total_cells}") + print(f"- celle riempite: {filled_cells} ({filled_ratio * 100:.1f}%)") + print(f"- celle vuote: {empty_cells} ({empty_ratio * 100:.1f}%)") + print(f"- target celle vuote: {target_empty_cells} ({args.target_empty_ratio * 100:.1f}%)") + if target_delta > 0: + print(f"- distanza dal target: {target_delta} celle vuote in piu del target") + elif target_delta < 0: + print(f"- distanza dal target: {-target_delta} celle vuote in meno del target") + else: + print("- distanza dal target: centrato") + print(f"- topic richiesti: {', '.join(active_topics)}") + print(f"- parole in tema: {len(topic_words)}") + print(f"- parole fuori tema o non classificate: {len(off_topic_words)}") + if topic_distribution: + print("- distribuzione topic:") + for selected_topic, count in topic_distribution.items(): + print(f" {selected_topic}: {count}") + if topic_words: + print(f"- elenco in tema: {', '.join(topic_words)}") + if off_topic_words: + print(f"- elenco fuori tema/non classificate: {', '.join(off_topic_words)}") + print("- parti del 
discorso:") + for label in ("sostantivi", "aggettivi", "verbi", "avverbi", "preposizioni", "congiunzioni", "altri"): + print(f" {label}: {pos_counts.get(label, 0)}") + + def parse_difficulty(value: str) -> int: text = str(value).strip().lower() if text in DIFFICULTY_ALIASES: @@ -243,7 +516,30 @@ def load_selected_vocabulary(path: Path | None) -> List[str]: return path.read_text(encoding="utf-8").splitlines() -def load_semantic_payload() -> Dict[str, object]: +def resolve_runtime_lexicon_path(requested: Path | None) -> Path: + global ACTIVE_LEXICON_PATH + if requested is not None: + path = requested if requested.is_absolute() else Path(__file__).resolve().parent / requested + if not path.exists(): + raise SystemExit(f"Il lessico specificato con --lexicon non esiste: {path}") + ACTIVE_LEXICON_PATH = path + return path + if ACTIVE_LEXICON_PATH is not None: + return ACTIVE_LEXICON_PATH + base_dir = Path(__file__).resolve().parent + for candidate in DEFAULT_RUNTIME_LEXICON_CANDIDATES: + path = base_dir / candidate + if path.exists(): + ACTIVE_LEXICON_PATH = path + return path + ACTIVE_LEXICON_PATH = ENRICHED_LEXICON_OUTPUT_PATH + return ACTIVE_LEXICON_PATH + + +def load_semantic_payload(path: Path | None = None) -> Dict[str, object]: + runtime_path = resolve_runtime_lexicon_path(path) + if runtime_path.exists(): + return json.loads(runtime_path.read_text(encoding="utf-8")) if not SEMANTIC_LEXICON_OUTPUT_PATH.exists(): lexicon = build_semantic_lexicon() SEMANTIC_LEXICON_OUTPUT_PATH.write_text( @@ -253,6 +549,74 @@ def load_semantic_payload() -> Dict[str, object]: return json.loads(SEMANTIC_LEXICON_OUTPUT_PATH.read_text(encoding="utf-8")) +def parse_topics(value: str) -> List[str]: + topics = [] + seen = set() + for raw_topic in str(value or DEFAULT_TOPIC).split(","): + topic = raw_topic.strip().lower() + if not topic or topic in seen: + continue + topics.append(topic) + seen.add(topic) + return topics or [DEFAULT_TOPIC] + + +def primary_topic(value: str) -> str: + return 
def resolve_topics(args: argparse.Namespace, difficulty_level: int) -> List[str]:
    """Decide the active topics and record them on ``args``.

    Sets ``args.topic`` (comma-joined active topics) and
    ``args.topic_seed_counts`` (topic -> number of seed words available).

    * Explicit topics (``--topic`` other than the default) are honoured as
      given, up to three. ``--max-topics`` is documented as applying only
      when the topic is the default one, so it must not truncate an explicit
      list; with its default of 1 it used to reduce ``a,b,c`` to ``a``.
    * With the default topic and ``--max-topics`` of 1 (or less), the
      default topic is kept.
    * With the default topic and a larger ``--max-topics``, up to that many
      topics are drawn at random (seeded by ``args.seed``) from the lexicon
      topics that yield at least one seed word.

    Args:
        args: Parsed command-line options; mutated in place.
        difficulty_level: Numeric difficulty passed to seed-word selection.

    Returns:
        The list of active topics.
    """
    hard_cap = 3  # upper bound on simultaneously active topics
    requested = parse_topics(args.topic)
    max_topics = max(1, min(hard_cap, int(args.max_topics)))

    if requested != [DEFAULT_TOPIC]:
        selected = requested[:hard_cap]
        args.topic = ",".join(selected)
        args.topic_seed_counts = {
            topic: len(select_initial_words(difficulty_level, topic, args.initial_word_count))
            for topic in selected
        }
        return selected

    if max_topics <= 1:
        args.topic = DEFAULT_TOPIC
        args.topic_seed_counts = {}
        return [DEFAULT_TOPIC]

    candidates = []
    for candidate in available_topics_from_lexicon(load_semantic_payload(), min_words=1):
        available = len(select_initial_words(difficulty_level, candidate, args.initial_word_count))
        if available > 0:
            candidates.append((candidate, available))
    if not candidates:
        args.topic = DEFAULT_TOPIC
        args.topic_seed_counts = {}
        return [DEFAULT_TOPIC]

    # The candidate list is built in a fixed order, so a given seed always
    # produces the same draw.
    rng = random.Random(args.seed)
    rng.shuffle(candidates)
    selected_pairs = candidates[: min(max_topics, len(candidates))]
    selected = [topic for topic, _ in selected_pairs]
    args.topic = ",".join(selected)
    args.topic_seed_counts = dict(selected_pairs)
    return selected
semantic_topics = { @@ -271,6 +635,10 @@ def matches_topic_roots(word: str, selected_topic: str) -> bool: def topic_relevance(entry: Dict[str, object], topic: str) -> int: + active_topics = parse_topics(topic) + if len(active_topics) > 1: + return max(topic_relevance(entry, item) for item in active_topics) + selected_topic = topic.strip().lower() if selected_topic == DEFAULT_TOPIC: return 20 @@ -295,6 +663,10 @@ def topic_relevance(entry: Dict[str, object], topic: str) -> int: def strong_topic_relevance(entry: Dict[str, object], topic: str) -> int: + active_topics = parse_topics(topic) + if len(active_topics) > 1: + return max(strong_topic_relevance(entry, item) for item in active_topics) + selected_topic = topic.strip().lower() if selected_topic == DEFAULT_TOPIC: return 20 @@ -341,7 +713,7 @@ def is_general_fill_support(entry: Dict[str, object]) -> bool: def load_filtered_entries(level: int, topic: str) -> List[Dict[str, object]]: payload = load_semantic_payload() - normalized_topic = topic.strip().lower() + normalized_topic = ",".join(parse_topics(topic)) eligible = [ entry @@ -400,6 +772,40 @@ def load_semantic_metadata_for_vocabulary(words: List[str], topic: str) -> Dict[ def select_initial_words(level: int, topic: str, count: int) -> List[str]: + active_topics = parse_topics(topic) + if len(active_topics) > 1: + topic_pools = { + selected_topic: select_initial_words(level, selected_topic, count) + for selected_topic in active_topics + } + selected: List[str] = [] + indexes = {selected_topic: 0 for selected_topic in active_topics} + + while len(selected) < count: + progressed = False + for selected_topic in active_topics: + pool = topic_pools.get(selected_topic, []) + while indexes[selected_topic] < len(pool) and pool[indexes[selected_topic]] in selected: + indexes[selected_topic] += 1 + if indexes[selected_topic] >= len(pool): + continue + selected.append(pool[indexes[selected_topic]]) + indexes[selected_topic] += 1 + progressed = True + if len(selected) >= 
count: + break + if not progressed: + break + + if len(selected) < count: + fallback = select_initial_words(level, DEFAULT_TOPIC, count) + for word in fallback: + if word not in selected: + selected.append(word) + if len(selected) >= count: + break + return selected[:count] + payload = load_semantic_payload() normalized_topic = topic.strip().lower() abstract_like_topics = {"abstract", "actions"} @@ -408,6 +814,10 @@ def select_initial_words(level: int, topic: str, count: int) -> List[str]: topics, semantic_topics = entry_topics(entry) return selected_topic in topics + def semantic_matches(entry: Dict[str, object], selected_topic: str) -> bool: + topics, semantic_topics = entry_topics(entry) + return selected_topic in semantic_topics and selected_topic not in topics + def word_score(entry: Dict[str, object], selected_topic: str) -> tuple[int, int, int, int, int, int, str]: topics, semantic_topics = entry_topics(entry) quality = int(entry.get("quality_score", 0)) @@ -479,6 +889,33 @@ def select_initial_words(level: int, topic: str, count: int) -> List[str]: return False return True + def is_semantic_seed_friendly(entry: Dict[str, object], selected_topic: str) -> bool: + word = str(entry.get("form", "")) + pos = str(entry.get("pos", "")) + topics, semantic_topics = entry_topics(entry) + if selected_topic not in semantic_topics: + return False + if len(word) < 4 or len(word) > 13: + return False + if pos not in {"NOUN", "ADJ", "VERB"}: + return False + if word.endswith(ABSTRACTISH_SUFFIXES): + return False + if "abstract" in topics: + return False + blocked_substrings = TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ()) + if any(part in word for part in blocked_substrings): + return False + required_substrings = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic) + if ( + selected_topic in CONCRETE_TOPICS + and required_substrings + and selected_topic != DEFAULT_TOPIC + and not any(part in word for part in required_substrings) + ): + return False + return True + def 
overlap_score(left: str, right: str) -> int: shared = set(left) & set(right) return sum(min(left.count(ch), right.count(ch)) for ch in shared) @@ -548,6 +985,20 @@ def select_initial_words(level: int, topic: str, count: int) -> List[str]: relaxed_pool = sorted(pool, key=lambda entry: word_score(entry, normalized_topic), reverse=True) selected = pick_seed_set(strict_pool, normalized_topic, count) + if len(selected) < count and normalized_topic != DEFAULT_TOPIC: + semantic_pool = [ + entry + for entry in eligible + if semantic_matches(entry, normalized_topic) + and is_semantic_seed_friendly(entry, normalized_topic) + ] + semantic_selected = pick_seed_set(semantic_pool, normalized_topic, count) + for word in semantic_selected: + if word not in selected: + selected.append(word) + if len(selected) >= count: + break + if len(selected) < count and normalized_topic == DEFAULT_TOPIC: relaxed_selected = pick_seed_set(relaxed_pool, normalized_topic, count) for word in relaxed_selected: @@ -569,10 +1020,13 @@ def select_initial_words(level: int, topic: str, count: int) -> List[str]: def main() -> None: args = parse_args() + args.lexicon = resolve_runtime_lexicon_path(args.lexicon) ensure_vocabulary(args) ensure_lexicon(args) ensure_semantic_lexicon(args) difficulty_level = parse_difficulty(args.difficulty) + active_topics = resolve_topics(args, difficulty_level) + ensure_babelnet_enrichment(args) initial_words = select_initial_words(difficulty_level, args.topic, args.initial_word_count) generator = CrosswordGenerator( @@ -590,7 +1044,13 @@ def main() -> None: print(f"Intersezioni: {initial_state.intersections}") print(f"Dimensioni: {initial_state.width()} x {initial_state.height()} (diff={initial_state.shape_difference()})") print(f"Difficolta filler: {args.difficulty} -> livello {difficulty_level}") - print(f"Tema filler: {args.topic}") + print(f"Topic attivi: {', '.join(active_topics)}") + print(f"Lessico runtime: {args.lexicon.name}") + if getattr(args, "topic_seed_counts", 
None): + print( + "Parole-seme disponibili per topic: " + + ", ".join(f"{topic}={count}" for topic, count in args.topic_seed_counts.items()) + ) if args.seed is not None: print(f"Seed: {args.seed}") print() @@ -600,6 +1060,9 @@ def main() -> None: print(", ".join(initial_words)) if args.skip_fill: + initial_words_for_clues = [placement.word for placement in initial_state.placements] + enrich_words_for_definitions(args, initial_words_for_clues) + print_definitions(args, initial_state) return vocabulary = load_selected_vocabulary(args.vocabulary) if args.vocabulary else load_filtered_vocabulary(difficulty_level, args.topic) @@ -632,6 +1095,10 @@ def main() -> None: direction = "orizzontale" if placement.direction == "H" else "verticale" print(f"{index:>2}. {placement.word} ({placement.x}, {placement.y}) {direction}") + final_words = [placement.word for placement in final_state.placements] + enrich_words_for_definitions(args, final_words) + print_definitions(args, final_state) + if __name__ == "__main__": main() diff --git a/refine_lexicon_topics.py b/refine_lexicon_topics.py new file mode 100644 index 0000000..99e5533 --- /dev/null +++ b/refine_lexicon_topics.py @@ -0,0 +1,473 @@ +from __future__ import annotations + +import argparse +import json +import re +from copy import deepcopy +from datetime import datetime +from pathlib import Path +from typing import Dict, Iterable, List, Tuple + +from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH + + +REFINED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_refined.json") + +TOPIC_KEYWORDS: Dict[str, Tuple[str, ...]] = { + "religion": ( + "abbazia", + "abate", + "arcivescovo", + "cappella", + "cardinale", + "chiesa", + "clero", + "convento", + "diocesi", + "ecclesiast", + "fede", + "frate", + "mistica", + "monaco", + "monastero", + "parrocchia", + "prete", + "religion", + "sacerdot", + "santo", + "vescovo", + ), + "ecclesiastical_hierarchy": ( + "abate", + "arcivescovo", + "carica ecclesiastica", + 
"cardinale", + "clero", + "dignità ecclesiastica", + "ecclesiast", + "ordinazione", + "parroco", + "patriarca", + "pontefice", + "prete", + "priore", + "superiore del monastero", + "vescovo", + ), + "honorific_title": ( + "carica", + "epiteto", + "nobile", + "onore", + "onorific", + "titolo", + ), + "mysticism": ( + "asceta", + "contemplazione", + "estasi", + "mistica", + "mistico", + "monachesimo", + "spiritual", + ), + "geography": ( + "borgo", + "città", + "comune", + "frazione", + "geografia", + "isola", + "località", + "paese", + "provincia", + "regione", + "stato", + "toponimo", + "valle", + ), + "transport": ( + "aereo", + "aeroplano", + "auto", + "autobus", + "autocarro", + "barca", + "bicicletta", + "imbarcazione", + "locomotiva", + "motore", + "nave", + "pista", + "porto", + "stazione", + "traghetto", + "treno", + "trasport", + "veicolo", + "viaggio", + ), + "nature": ( + "acqua", + "albero", + "animale", + "bosco", + "fiore", + "fiume", + "foresta", + "mare", + "montagna", + "natura", + "pianta", + "terra", + ), + "health": ( + "ambulanza", + "anemia", + "cura", + "farmaco", + "malattia", + "medic", + "ospedale", + "paziente", + "salute", + "soccorso", + "terapia", + ), + "war": ( + "arma", + "artiglieria", + "assalto", + "battaglia", + "bombard", + "esercito", + "fortezza", + "guerra", + "militare", + "soldato", + "trincea", + ), +} + +TAG_STOPWORDS = { + "and", + "con", + "da", + "dei", + "del", + "della", + "delle", + "dello", + "di", + "e", + "il", + "in", + "la", + "le", + "lo", + "nel", + "nella", + "per", + "su", + "the", + "un", + "una", +} + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Genera un lessico raffinato con campi aggiuntivi per topic, tag semantici e sensi." 
def slugify_tag(text: str) -> str:
    """Turn free text into a lowercase ``snake_case`` ASCII tag.

    Diacritics are folded to their base letter before slugging, so Italian
    words keep all their letters (``"città"`` -> ``"citta"``). Without the
    folding the accented letter is treated as a separator and then stripped,
    truncating the word to ``"citt"``. Every run of characters outside
    ``a-z0-9`` becomes a single underscore; leading and trailing underscores
    are removed.

    Args:
        text: Raw tag text.

    Returns:
        The slug, or an empty string when nothing usable remains.
    """
    import unicodedata  # local import: only this helper needs it

    decomposed = unicodedata.normalize("NFKD", text.strip().lower())
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    return re.sub(r"[^a-z0-9]+", "_", folded).strip("_")
def infer_topic_scores(entry: Dict[str, object]) -> Dict[str, int]:
    """Score each known topic by keyword hits in the entry's flattened text.

    Every hit of a keyword from ``TOPIC_KEYWORDS`` adds 12 points to its
    topic, capped at 100. Keywords are matched only where a word starts, so
    stems still work as prefixes (``"medic"`` matches ``"medicina"``) while
    hits buried inside unrelated words are ignored (``"arma"`` no longer
    matches ``"farmaco"``, nor ``"mare"`` the verb ``"chiamare"``).

    Args:
        entry: Lexicon entry.

    Returns:
        Mapping of topic to score for topics with at least one hit.
    """
    text = flatten_text(entry)
    scores: Dict[str, int] = {}
    for topic, keywords in TOPIC_KEYWORDS.items():
        hits = sum(
            # (?<!\w): the keyword must not be preceded by a word character.
            len(re.findall(rf"(?<!\w){re.escape(keyword.lower())}", text))
            for keyword in keywords
        )
        if hits:
            scores[topic] = min(12 * hits, 100)
    return scores
def collect_senses(entry: Dict[str, object], topic_scores: Dict[str, int]) -> List[Dict[str, object]]:
    """Build the list of sense records for an entry.

    One record is produced for each ``semantic`` synset that has a
    definition (fixed confidence 0.7), plus one for the BabelNet best synset
    when it has an id and at least one gloss. The BabelNet confidence is
    ``topic_score / 100`` clamped to ``[0.4, 0.95]``. Topics scoring at
    least 50 in ``topic_scores`` are attached to every record.

    JSON ``null`` values are treated as missing: they no longer produce the
    literal text ``"None"`` as a definition, gloss or topic, and a null or
    non-numeric ``topic_score`` falls back to 0 instead of raising.

    Args:
        entry: Lexicon entry.
        topic_scores: Topic scores inferred for the entry.

    Returns:
        Sense dictionaries with keys ``source``, ``id``, ``definition``,
        ``lemmas``, ``topics`` and ``confidence``.
    """

    def text_of(value: object) -> str:
        # str(None) would yield "None"; treat null as empty text instead.
        return "" if value is None else str(value).strip()

    strong_topics = [topic for topic, score in topic_scores.items() if score >= 50]
    senses: List[Dict[str, object]] = []

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        for synset in semantic.get("synsets", []) or []:
            if not isinstance(synset, dict):
                continue
            definition = text_of(synset.get("definition", ""))
            if not definition:
                continue
            senses.append(
                {
                    "source": "semantic",
                    "id": synset.get("id"),
                    "definition": definition,
                    "lemmas": dedupe(str(item) for item in synset.get("lemmas", []) or []),
                    "topics": dedupe(list(semantic.get("semantic_topics", []) or []) + strong_topics),
                    "confidence": 0.7,
                }
            )

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        best_synset = babelnet.get("best_synset", {})
        if isinstance(best_synset, dict) and best_synset.get("id"):
            glosses = [text for text in (text_of(item) for item in best_synset.get("glosses", []) or []) if text]
            if glosses:
                try:
                    raw_score = float(best_synset.get("topic_score", 0))
                except (TypeError, ValueError):
                    raw_score = 0.0
                senses.append(
                    {
                        "source": "babelnet",
                        "id": best_synset.get("id"),
                        "definition": glosses[0],
                        "lemmas": dedupe(str(item) for item in best_synset.get("senses", []) or []),
                        "topics": dedupe([text_of(best_synset.get("topic", ""))] + strong_topics),
                        "confidence": round(min(max(raw_score / 100.0, 0.4), 0.95), 2),
                    }
                )

    return senses
def build_refined_lexicon(args: argparse.Namespace) -> Dict[str, object]:
    """Load the input lexicon, refine every entry and assemble the output payload.

    Entries that are not dictionaries are ignored. The returned payload has a
    ``meta`` section (counts, options used, generation timestamp) and the
    refined ``entries``.

    Raises:
        ValueError: If the input is not a JSON object with an ``entries`` key.
    """
    payload = load_json(args.input)
    is_valid = isinstance(payload, dict) and "entries" in payload
    if not is_valid:
        raise ValueError(f"Lessico di input non valido: {args.input}")

    refined_entries = []
    for raw_entry in payload.get("entries", []) or []:
        if isinstance(raw_entry, dict):
            refined_entries.append(refine_entry(raw_entry, args.replace_general, args.min_topic_score))

    review_count = 0
    topicful_count = 0
    for refined in refined_entries:
        if refined.get("needs_review"):
            review_count += 1
        if len(refined.get("topic_suggestions", []) or []) > 0:
            topicful_count += 1

    meta = {
        "language": "it",
        "version": 1,
        "base_lexicon": args.input.name,
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(refined_entries),
        "replace_general": args.replace_general,
        "min_topic_score": args.min_topic_score,
        "review_count": review_count,
        "topicful_count": topicful_count,
    }
    return {"meta": meta, "entries": refined_entries}
{args.output}") + print(f"Voci totali: {payload['meta']['entry_count']}") + print(f"Voci con suggerimenti di topic: {payload['meta']['topicful_count']}") + print(f"Voci marcate needs_review: {payload['meta']['review_count']}") + + +if __name__ == "__main__": + main() diff --git a/run_babelnet_daily_batch.bat b/run_babelnet_daily_batch.bat new file mode 100644 index 0000000..a1c9c2c --- /dev/null +++ b/run_babelnet_daily_batch.bat @@ -0,0 +1,5 @@ +@echo off +setlocal +cd /d "%~dp0" +python babelnet_daily_batch.py --api-call-limit 1900 --per-key-api-call-limit 950 --sleep 0.2 +endlocal