alpha01 backoffice: crossword engine, lexicon curation and JSON contract

This commit is contained in:
2026-04-29 13:24:04 +02:00
parent a1f8cb8577
commit 47d8957e15
20 changed files with 5985 additions and 16 deletions

View File

@@ -0,0 +1,492 @@
from __future__ import annotations
import argparse
import json
import re
from copy import deepcopy
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH
# Resolves a bare file name to a path next to this module.
_sibling_file = Path(__file__).with_name

# Default locations of the inputs, the output and the offset-index cache.
REVIEW_INPUT_PATH = _sibling_file("to_be_review.json")
WIKTEXTRACT_INPUT_PATH = _sibling_file("raw-wiktextract-data.jsonl")
WIKTEXTRACT_OUTPUT_PATH = _sibling_file("lexicon_it_refined_plus_wiktextract.json")
WIKTEXTRACT_INDEX_CACHE_PATH = _sibling_file(".wiktextract_it_index.json")

# Review reasons that select a lemma for enrichment when --review-reasons is
# not given on the command line.
DEFAULT_REVIEW_REASONS = {
    "babelnet_ambiguous",
    "no_viable_definition",
    "only_general_topics",
}
# Maps the lowercase part-of-speech codes read from the Wiktextract records to
# the uppercase tags used by the lexicon. Codes that are not listed here are
# handled by the caller (see map_pos).
POS_MAP = {
    "noun": "NOUN",
    "adj": "ADJ",
    "verb": "VERB",
    "adv": "ADV",
    "prep": "PREP",
    "conj": "CONJ",
    "pron": "PRON",
    "intj": "INTJ",
}
# Maps a lowercase topic label to the canonical topic used by the lexicon.
#
# The map must be idempotent: already-simplified senses carry canonical topics
# and are looked up here a second time when word-level topic hints are
# computed, so every canonical value needs an identity entry or it is
# silently dropped on that second pass.
TOPIC_MAP = {
    "christianity": "religion",
    "religion": "religion",
    "history": "history",
    "agriculture": "agriculture",
    "engineering": "technology",
    "mechanics": "technology",
    "technology": "technology",
    "medicine": "health",
    "health": "health",
    "geography": "geography",
    "biology": "nature",
    "nature": "nature",
    "aeronautics": "transport",
    "transport": "transport",
}
# Maps a lowercase category label ("<italian stem>-it") to the canonical topic
# it hints at. Built from (stem, topic) pairs so the shared "-it" suffix is
# written only once.
CATEGORY_TOPIC_HINTS = {
    f"{stem}-it": topic
    for stem, topic in (
        ("religione", "religion"),
        ("cristianesimo", "religion"),
        ("storia", "history"),
        ("agricoltura", "agriculture"),
        ("medicina", "health"),
        ("ingegneria", "technology"),
        ("meccanica", "technology"),
        ("tecnologia", "technology"),
        ("geografia", "geography"),
        ("biologia", "nature"),
        ("aeronautica", "transport"),
    )
}
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the enrichment script.

    Returns:
        The namespace produced by argparse, with the attributes ``input``,
        ``review``, ``wiktextract``, ``output``, ``index_cache`` (all ``Path``),
        ``word_limit`` (int), ``words`` and ``review_reasons`` (comma-separated
        strings) and ``skip_existing`` (bool).
    """
    description = (
        "Arricchisce il lessico refined leggendo offline il file raw-wiktextract-data.jsonl, "
        "senza effettuare richieste di rete."
    )
    parser = argparse.ArgumentParser(description=description)

    # All path-valued options share the same shape: flag, default, help text.
    path_options = (
        ("--input", REFINED_LEXICON_OUTPUT_PATH, "Lessico refined di partenza."),
        (
            "--review",
            REVIEW_INPUT_PATH,
            "File to_be_review.json da usare per selezionare i lemmi prioritari.",
        ),
        ("--wiktextract", WIKTEXTRACT_INPUT_PATH, "File JSONL raw estratto da Wiktionary."),
        ("--output", WIKTEXTRACT_OUTPUT_PATH, "Lessico refined con blocco wiktextract aggiunto."),
        (
            "--index-cache",
            WIKTEXTRACT_INDEX_CACHE_PATH,
            "Cache dell'indice lemmi->righe del JSONL per velocizzare i rilanci.",
        ),
    )
    for flag, default_path, help_text in path_options:
        parser.add_argument(flag, type=Path, default=default_path, help=help_text)

    parser.add_argument(
        "--word-limit",
        type=int,
        default=0,
        help="Limite massimo di parole da elaborare. 0 = tutte le candidate.",
    )
    parser.add_argument(
        "--words",
        default="",
        help="Lista separata da virgole di lemmi specifici da arricchire.",
    )
    # Sorted so the default shown in --help does not depend on set ordering.
    default_reasons = ",".join(sorted(DEFAULT_REVIEW_REASONS))
    parser.add_argument(
        "--review-reasons",
        default=default_reasons,
        help="Motivi del file review da trattare con priorita, separati da virgole.",
    )
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="Salta le voci che nel lessico di input hanno gia un blocco wiktextract utile.",
    )
    return parser.parse_args()
def load_json(path: Path, default: object) -> object:
    """Return the JSON document stored at *path*, or *default* if it is absent.

    Args:
        path: File to read; decoded as UTF-8.
        default: Value returned unchanged when *path* does not exist.

    Returns:
        The decoded JSON value, or *default*.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return default
def write_json(path: Path, payload: object) -> None:
    """Serialise *payload* as JSON and write it to *path* as UTF-8.

    Non-ASCII characters are written as-is (not escaped) and the output is
    indented by two spaces. An existing file is overwritten.
    """
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
def parse_csv_set(value: str) -> set[str]:
    """Split a comma-separated string into a set of lowercase, trimmed tokens.

    Empty tokens are discarded; a falsy *value* yields an empty set.
    """
    trimmed = (piece.strip() for piece in str(value or "").split(","))
    return {token.lower() for token in trimmed if token}
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Build the ``(form, pos)`` identity key of a lexicon entry.

    The form is taken from ``normalized_form``, falling back to ``form``, and
    is trimmed and lowercased; the part of speech is trimmed and uppercased.
    Missing or falsy fields become empty strings.
    """
    raw_form = entry.get("normalized_form") or entry.get("form") or ""
    raw_pos = entry.get("pos") or ""
    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
def load_or_build_index(jsonl_path: Path, index_cache_path: Path) -> Dict[str, List[int]]:
    """Return a lemma -> line-offsets index of the Italian records in the JSONL.

    A cached index is reused when its stored metadata (resolved source path,
    file size and modification time) matches the current file; otherwise the
    JSONL is scanned and the cache file is rewritten.

    Args:
        jsonl_path: JSONL file with one JSON object per line.
        index_cache_path: JSON file used to persist the index between runs.

    Returns:
        A dict mapping each lowercase, trimmed ``word`` whose record has
        ``lang_code == "it"`` to the list of offsets at which its lines start.
        The offsets are values returned by ``tell()`` on a text-mode handle,
        so they are meant to be passed back to ``seek()`` on a handle opened
        the same way.

    Raises:
        FileNotFoundError: If no valid cache exists and *jsonl_path* is missing.
        json.JSONDecodeError: If a non-empty line is not valid JSON.
    """
    stat_result = jsonl_path.stat() if jsonl_path.exists() else None
    expected_meta = {
        "source": str(jsonl_path.resolve()),
        "size": stat_result.st_size if stat_result is not None else 0,
        "mtime": stat_result.st_mtime if stat_result is not None else 0,
    }

    cached = load_json(index_cache_path, {})
    if isinstance(cached, dict):
        cached_index = cached.get("index")
        if cached.get("meta") == expected_meta and isinstance(cached_index, dict):
            return {str(lemma): list(positions) for lemma, positions in cached_index.items()}

    index: Dict[str, List[int]] = {}
    with jsonl_path.open("r", encoding="utf-8") as handle:
        # An explicit readline() loop is required: iterating the handle with
        # ``for`` disables tell() on text files.
        while True:
            line_start = handle.tell()
            line = handle.readline()
            if line == "":
                break
            record_text = line.rstrip("\n")
            if record_text == "":
                continue
            record = json.loads(record_text)
            if record.get("lang_code") == "it":
                lemma = str(record.get("word", "")).strip().lower()
                if lemma:
                    index.setdefault(lemma, []).append(line_start)

    write_json(index_cache_path, {"meta": expected_meta, "index": index})
    return index
def read_jsonl_objects_at_offsets(jsonl_path: Path, offsets: Sequence[int]) -> List[Dict[str, object]]:
    """Decode the JSON lines that start at the given offsets of *jsonl_path*.

    Args:
        jsonl_path: JSONL file, opened in UTF-8 text mode.
        offsets: Positions to ``seek()`` to, in the order the results should
            be returned.

    Returns:
        One decoded object per offset; an offset at which nothing can be read
        (end of file) is skipped.

    Raises:
        json.JSONDecodeError: If the line at an offset is not valid JSON.
    """
    decoded: List[Dict[str, object]] = []
    with jsonl_path.open("r", encoding="utf-8") as handle:
        for position in offsets:
            handle.seek(position)
            raw_line = handle.readline()
            if raw_line:
                decoded.append(json.loads(raw_line))
    return decoded
def map_pos(value: str) -> str:
    """Translate a part-of-speech code through POS_MAP.

    The code is trimmed and lowercased before the lookup. A code without a
    mapping is returned uppercased; a falsy or blank value yields ``""``.
    """
    code = str(value or "").strip().lower()
    if code in POS_MAP:
        return POS_MAP[code]
    # Uppercasing an empty code leaves it empty, matching the blank case.
    return code.upper()
def normalize_text(text: str) -> str:
    """Trim *text* and collapse every run of whitespace into a single space.

    A falsy *text* yields an empty string; other values are passed through
    ``str()`` first.
    """
    return re.sub(r"\s+", " ", str(text or "").strip())
def sense_topics(sense: Dict[str, object], categories: Sequence[str]) -> List[str]:
    """Collect the canonical topics suggested by a sense and by category labels.

    Args:
        sense: Mapping whose optional ``topics`` list is looked up in TOPIC_MAP.
        categories: Labels looked up in CATEGORY_TOPIC_HINTS.

    Returns:
        The sorted, de-duplicated canonical topics; labels without a mapping
        are ignored. Lookups are done on the trimmed, lowercased label.
    """
    found = {
        TOPIC_MAP.get(str(label).strip().lower())
        for label in sense.get("topics", []) or []
    }
    found |= {
        CATEGORY_TOPIC_HINTS.get(str(label).strip().lower())
        for label in categories
    }
    # Failed lookups contribute None, which must be removed before sorting.
    return sorted(topic for topic in found if topic)
def word_level_topics(entries: Sequence[Dict[str, object]], categories: Sequence[str]) -> List[str]:
    """Return the sorted union of ``sense_topics`` over every sense of *entries*.

    Senses that are not dicts are ignored; *categories* is forwarded unchanged
    to each ``sense_topics`` call.
    """
    collected = set()
    for entry in entries:
        dict_senses = (
            sense for sense in entry.get("senses", []) or [] if isinstance(sense, dict)
        )
        for sense in dict_senses:
            collected.update(sense_topics(sense, categories))
    return sorted(collected)
def grammar_hints(entries: Sequence[Dict[str, object]]) -> List[str]:
    """Derive coarse grammar hints from entry tags and gloss wording.

    Rules, all case-insensitive:
      * ``voce_verbale`` when an entry has pos ``verb`` and the tag ``form-of``;
      * ``diminutivo`` / ``accrescitivo`` / ``peggiorativo`` when the word
        occurs in a gloss of a ``noun`` entry;
      * ``congiuntivo`` / ``imperativo`` / ``plurale`` when the word occurs in
        a gloss of any entry.

    Returns:
        The sorted list of distinct hints.
    """
    noun_markers = ("diminutivo", "accrescitivo", "peggiorativo")
    general_markers = ("congiuntivo", "imperativo", "plurale")

    found = set()
    for entry in entries:
        pos = str(entry.get("pos", "")).lower()
        tags = {str(tag).lower() for tag in entry.get("tags", []) or []}
        if pos == "verb" and "form-of" in tags:
            found.add("voce_verbale")

        # Noun entries are additionally scanned for the noun-only markers.
        markers = (general_markers + noun_markers) if pos == "noun" else general_markers
        for sense in entry.get("senses", []) or []:
            if not isinstance(sense, dict):
                continue
            for gloss in sense.get("glosses", []) or []:
                lowered = str(gloss).lower()
                found.update(marker for marker in markers if marker in lowered)
    return sorted(found)
def simplify_entry(obj: Dict[str, object]) -> Dict[str, object]:
    """Reduce a raw JSONL record to the fields kept in the lexicon.

    Senses without any non-empty gloss, and senses that are not dicts, are
    dropped. Text fields are whitespace-normalised, the part of speech is
    mapped with ``map_pos`` and each sense gets the topics computed by
    ``sense_topics`` from its own ``topics`` and the record-level categories.

    Returns:
        A dict with the keys ``word``, ``lang``, ``lang_code``, ``pos``,
        ``pos_title``, ``tags``, ``categories``, ``senses``, ``synonyms`` and
        ``related``.
    """
    entry_categories = [normalize_text(item) for item in obj.get("categories", []) or [] if item]

    simplified_senses = []
    for sense in obj.get("senses", []) or []:
        if not isinstance(sense, dict):
            continue
        glosses = [text for text in map(normalize_text, sense.get("glosses", []) or []) if text]
        if not glosses:
            continue

        examples = []
        for example in sense.get("examples", []) or []:
            if isinstance(example, dict):
                example_text = normalize_text(example.get("text", ""))
                if example_text:
                    examples.append(example_text)

        simplified_senses.append(
            {
                "glosses": glosses,
                "examples": examples,
                "topics": sense_topics(sense, entry_categories),
                "tags": [str(tag) for tag in sense.get("tags", []) or [] if tag],
                "categories": [
                    normalize_text(item) for item in sense.get("categories", []) or [] if item
                ],
            }
        )

    def linked_words(field: str) -> List[Dict[str, object]]:
        # Keep only dict items that actually name a word.
        return [
            item
            for item in obj.get(field, []) or []
            if isinstance(item, dict) and item.get("word")
        ]

    return {
        "word": obj.get("word"),
        "lang": obj.get("lang"),
        "lang_code": obj.get("lang_code"),
        "pos": map_pos(str(obj.get("pos", ""))),
        "pos_title": obj.get("pos_title"),
        "tags": [str(tag) for tag in obj.get("tags", []) or [] if tag],
        "categories": entry_categories,
        "senses": simplified_senses,
        "synonyms": linked_words("synonyms"),
        "related": linked_words("related"),
    }
def choose_best_entries(refined_entry: Dict[str, object], candidates: Sequence[Dict[str, object]]) -> List[Dict[str, object]]:
    """Prefer the candidates whose part of speech matches the refined entry.

    The comparison is case-insensitive. When no candidate matches, all
    candidates are returned (as a new list).
    """
    wanted_pos = str(refined_entry.get("pos", "")).upper()
    same_pos = [item for item in candidates if str(item.get("pos", "")).upper() == wanted_pos]
    return same_pos or list(candidates)
def wiktextract_already_useful(entry: Dict[str, object]) -> bool:
    """Tell whether *entry* already carries a settled ``wiktextract`` block.

    The block counts as settled when its status (case-insensitive) is
    ``missing`` or ``no_match``, or when it is ``enriched`` and has a
    non-empty ``definitions`` or ``entries`` value. A block that is not a
    dict never counts.
    """
    block = entry.get("wiktextract", {})
    if not isinstance(block, dict):
        return False
    status = str(block.get("status", "")).lower()
    if status in {"missing", "no_match"}:
        return True
    return status == "enriched" and bool(block.get("definitions") or block.get("entries"))
def select_targets(
    refined_payload: Dict[str, object],
    review_payload: Dict[str, object],
    review_reasons: set[str],
    explicit_words: set[str],
    word_limit: int,
    skip_existing: bool,
) -> Tuple[List[Dict[str, object]], int]:
    """Choose the refined-lexicon entries that should be enriched.

    Two modes:
      * *explicit_words* non-empty: those words are looked up in the refined
        lexicon, in alphabetical order so that the result (and the
        *word_limit* cut) is the same on every run;
      * otherwise the review file is walked in order and a word is selected
        when one of its ``review_reasons`` is in *review_reasons* or when the
        refined entry's ``babelnet`` status is ``no_match``.

    Args:
        refined_payload: Refined lexicon; its ``entries`` list is used.
        review_payload: Review document; its ``entries`` list is used.
        review_reasons: Lowercase reasons that make a reviewed word eligible.
        explicit_words: Lowercase, trimmed words to enrich regardless of review.
        word_limit: Maximum number of selected entries; ``0`` or less = no limit.
        skip_existing: When true, entries for which
            ``wiktextract_already_useful`` holds are skipped and counted.

    Returns:
        ``(selected_entries, skipped_existing_count)``.

    Note:
        Entries are indexed by form only, so when several entries share a form
        the last one in the lexicon is the one selected.
    """
    refined_by_word: Dict[str, Dict[str, object]] = {}
    for entry in refined_payload.get("entries", []) or []:
        if not isinstance(entry, dict) or not entry.get("form"):
            continue
        # Strip as well as lowercase: every lookup word below is stripped.
        key = str(entry["form"]).strip().lower()
        if key:
            refined_by_word[key] = entry

    skipped_existing_count = 0

    if explicit_words:
        selected: List[Dict[str, object]] = []
        # Sets have no stable order; sort so runs are reproducible.
        for word in sorted(explicit_words):
            entry = refined_by_word.get(word)
            if entry is None:
                continue
            if skip_existing and wiktextract_already_useful(entry):
                skipped_existing_count += 1
                continue
            selected.append(entry)
        if word_limit > 0:
            selected = selected[:word_limit]
        return selected, skipped_existing_count

    selected = []
    seen = set()
    for review_entry in review_payload.get("entries", []) or []:
        if not isinstance(review_entry, dict):
            continue
        word = str(review_entry.get("form", "")).strip().lower()
        if not word or word in seen:
            continue
        refined = refined_by_word.get(word)
        if refined is None:
            continue
        if skip_existing and wiktextract_already_useful(refined):
            skipped_existing_count += 1
            continue

        reasons = {str(item).lower() for item in review_entry.get("review_reasons", []) or []}
        babelnet = refined.get("babelnet")
        # Tolerate a missing or malformed babelnet block instead of crashing.
        babelnet_status = str(babelnet.get("status", "")).lower() if isinstance(babelnet, dict) else ""
        if reasons & review_reasons or babelnet_status == "no_match":
            selected.append(refined)
            seen.add(word)
            if word_limit > 0 and len(selected) >= word_limit:
                break
    return selected, skipped_existing_count
def wiktextract_payload_for_entry(refined_entry: Dict[str, object], matches: Sequence[Dict[str, object]]) -> Dict[str, object]:
    """Build the ``wiktextract`` block stored on a refined entry.

    Args:
        refined_entry: The lexicon entry being enriched; its ``pos`` is used
            by ``choose_best_entries`` to narrow down *matches*.
        matches: Simplified records found for the entry's word.

    Returns:
        A dict that always has the keys ``status``, ``matched``,
        ``definitions``, ``entries``, ``topic_hints``, ``grammar_hints`` and
        ``categories``. ``status`` is ``"missing"`` when *matches* is empty,
        ``"enriched"`` when at least one non-empty gloss was collected, and
        ``"entries_without_definitions"`` otherwise; ``matched`` is true only
        in the ``"enriched"`` case.
    """
    if not matches:
        return {
            "status": "missing",
            "matched": False,
            "definitions": [],
            "entries": [],
            "topic_hints": [],
            "grammar_hints": [],
            # Present in every payload so consumers see one schema.
            "categories": [],
        }

    selected_entries = choose_best_entries(refined_entry, matches)

    definitions: List[str] = []
    all_categories: List[str] = []
    for item in selected_entries:
        all_categories.extend(item.get("categories", []) or [])
        for sense in item.get("senses", []) or []:
            if not isinstance(sense, dict):
                continue
            for gloss in sense.get("glosses", []) or []:
                text = normalize_text(gloss)
                if text:
                    definitions.append(text)

    categories = sorted({normalize_text(item) for item in all_categories} - {""})
    return {
        "status": "enriched" if definitions else "entries_without_definitions",
        "matched": bool(definitions),
        "definitions": definitions,
        "entries": selected_entries,
        "topic_hints": word_level_topics(selected_entries, all_categories),
        "grammar_hints": grammar_hints(selected_entries),
        "categories": categories,
    }
def enrich_from_wiktextract(args: argparse.Namespace) -> Dict[str, object]:
    """Run the enrichment: select targets, attach Wiktextract data, write output.

    Reads the refined lexicon and the review file named in *args*, selects the
    target entries, looks each one up in the JSONL through the offset index,
    stores the resulting payload under ``wiktextract`` (with a timestamp under
    ``wiktextract_generated_at``) and writes the merged lexicon to
    ``args.output``. Progress is printed to stdout.

    Args:
        args: Namespace with ``input``, ``review``, ``wiktextract``, ``output``,
            ``index_cache``, ``word_limit``, ``words``, ``review_reasons`` and
            ``skip_existing``.

    Returns:
        A summary dict with ``target_count``, ``skipped_existing_count``,
        ``matched_count``, ``missing_count`` and ``output``.

    Raises:
        ValueError: If the refined lexicon is not a dict with an ``entries`` key.
    """
    refined_payload = load_json(args.input, {"entries": []})
    if not (isinstance(refined_payload, dict) and "entries" in refined_payload):
        raise ValueError(f"Lessico refined non valido: {args.input}")

    review_payload = load_json(args.review, {"entries": []})
    if not isinstance(review_payload, dict):
        # An unusable review file is treated as an empty one.
        review_payload = {"entries": []}

    targets, skipped_existing_count = select_targets(
        refined_payload=refined_payload,
        review_payload=review_payload,
        review_reasons=parse_csv_set(args.review_reasons),
        explicit_words=parse_csv_set(args.words),
        word_limit=args.word_limit,
        skip_existing=args.skip_existing,
    )
    total = len(targets)

    summary_line = f"Target selezionati: {total}"
    if args.skip_existing:
        summary_line += f" | già saltati per wiktextract esistente: {skipped_existing_count}"
    print(summary_line)

    index = load_or_build_index(args.wiktextract, args.index_cache)

    # Work on copies keyed by (form, pos); enriched entries replace their
    # originals under the same key.
    refined_index = {}
    for original in refined_payload.get("entries", []) or []:
        if isinstance(original, dict):
            refined_index[entry_key(original)] = deepcopy(original)

    matched_count = 0
    missing_count = 0
    for position, entry in enumerate(targets, start=1):
        word = str(entry.get("form", "")).strip().lower()
        raw_objects = read_jsonl_objects_at_offsets(args.wiktextract, index.get(word, []))
        simplified = [simplify_entry(raw) for raw in raw_objects]

        updated = deepcopy(entry)
        payload = wiktextract_payload_for_entry(updated, simplified)
        updated["wiktextract"] = payload
        updated["wiktextract_generated_at"] = datetime.now().astimezone().isoformat(timespec="seconds")
        refined_index[entry_key(updated)] = updated

        if payload.get("matched"):
            matched_count += 1
        else:
            missing_count += 1

        definition_total = len(payload.get("definitions", []))
        topic_total = len(payload.get("topic_hints", []))
        entry_total = len(payload.get("entries", []))
        print(
            f"[{position}/{total}] {word}: "
            f"status={payload.get('status')} "
            f"def={definition_total} "
            f"topics={topic_total} "
            f"entries={entry_total}"
        )

    merged_entries = sorted(
        refined_index.values(),
        key=lambda item: (str(item.get("normalized_form", "")), str(item.get("pos", ""))),
    )

    # Start from the input meta (when it is a dict) and overlay the run stats.
    source_meta = refined_payload.get("meta")
    meta = dict(source_meta) if isinstance(source_meta, dict) else {}
    meta.update(
        {
            "wiktextract_source": str(args.wiktextract),
            "wiktextract_generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "wiktextract_target_count": total,
            "wiktextract_skipped_existing_count": skipped_existing_count,
            "wiktextract_matched_count": matched_count,
            "wiktextract_missing_count": missing_count,
        }
    )
    write_json(args.output, {"meta": meta, "entries": merged_entries})

    return {
        "target_count": total,
        "skipped_existing_count": skipped_existing_count,
        "matched_count": matched_count,
        "missing_count": missing_count,
        "output": str(args.output),
    }
def main() -> None:
    """Command-line entry point: run the enrichment and print its summary."""
    result = enrich_from_wiktextract(parse_args())
    # (label, key in the summary dict), printed in this order.
    report_lines = (
        ("Lessico con Wiktextract generato", "output"),
        ("Voci trattate", "target_count"),
        ("Voci già saltate", "skipped_existing_count"),
        ("Match Wiktextract", "matched_count"),
        ("Senza match Wiktextract", "missing_count"),
    )
    for label, key in report_lines:
        print(f"{label}: {result[key]}")


if __name__ == "__main__":
    main()