Compare commits
4 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 47d8957e15 | | |
| | a1f8cb8577 | | |
| | b172b9c04b | | |
| | 77c7e709b6 | | |
14
.gitignore
vendored
Normal file
14
.gitignore
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.babelnet_cache.json
|
||||
.wiktionary_cache.json
|
||||
.wiktextract_it_index.json
|
||||
.babelnet_api_key.local
|
||||
logs/
|
||||
raw-wiktextract-data.jsonl
|
||||
lexicon_it*.json
|
||||
llm_rescue_patch.json
|
||||
treccani_rescue_patch.json
|
||||
to_be_review*.json
|
||||
_*.json
|
||||
idee.txt
|
||||
BIN
__pycache__/build_lexicon.cpython-313.pyc
Normal file
BIN
__pycache__/build_lexicon.cpython-313.pyc
Normal file
Binary file not shown.
BIN
__pycache__/build_semantic_lexicon.cpython-313.pyc
Normal file
BIN
__pycache__/build_semantic_lexicon.cpython-313.pyc
Normal file
Binary file not shown.
BIN
__pycache__/build_vocabulary.cpython-313.pyc
Normal file
BIN
__pycache__/build_vocabulary.cpython-313.pyc
Normal file
Binary file not shown.
BIN
__pycache__/crossword_filler.cpython-313.pyc
Normal file
BIN
__pycache__/crossword_filler.cpython-313.pyc
Normal file
Binary file not shown.
BIN
__pycache__/crossword_generator.cpython-313.pyc
Normal file
BIN
__pycache__/crossword_generator.cpython-313.pyc
Normal file
Binary file not shown.
162
apply_llm_rescue_patch.py
Normal file
162
apply_llm_rescue_patch.py
Normal file
@@ -0,0 +1,162 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
|
||||
# Default file locations; each one is resolved in the same directory as this script.
# Curated lexicon used as the starting point (default of --lexicon).
DEFAULT_LEXICON_PATH = Path(__file__).with_name("lexicon_it_curated.json")
# LLM rescue patch to apply (default of --patch).
DEFAULT_PATCH_PATH = Path(__file__).with_name("llm_rescue_patch.json")
# Destination of the patched lexicon (default of --output).
DEFAULT_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_curated_llm.json")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse the process arguments.

    Options:
        --lexicon, --patch, --output: input/output paths, defaulting to the
            module-level ``DEFAULT_*`` constants.
        --min-confidence: float threshold (default 0.6).
        --include-needs-review: flag, off by default.

    Returns:
        The populated ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser(
        description="Applica una patch LLM rescue al lessico curato per produrre un lessico operativo aggiornato."
    )

    # The three path options share one shape, so they are declared from a table
    # (registration order is kept so the generated --help output is unchanged).
    path_options = (
        ("--lexicon", DEFAULT_LEXICON_PATH, "Lessico curato di partenza."),
        ("--patch", DEFAULT_PATCH_PATH, "Patch LLM rescue da applicare."),
        ("--output", DEFAULT_OUTPUT_PATH, "Lessico aggiornato in uscita."),
    )
    for flag, default_path, help_text in path_options:
        cli.add_argument(flag, type=Path, default=default_path, help=help_text)

    cli.add_argument(
        "--min-confidence",
        type=float,
        default=0.6,
        help="Confidenza minima per applicare automaticamente una definizione rescue.",
    )
    cli.add_argument(
        "--include-needs-review",
        action="store_true",
        help="Applica anche voci marcate needs_human_review=true se superano la soglia di confidenza.",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Return the JSON document stored at *path*, or *default* when it is absent.

    The file is read directly instead of being probed with ``exists()`` first,
    so a file removed between the check and the read can no longer raise.

    Args:
        path: Location of the UTF-8 encoded JSON file.
        default: Value returned (as-is, not copied) when the file is missing.

    Returns:
        The decoded JSON value, or *default*.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    try:
        raw_text = path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        # Same situations in which Path.exists() reports the path as missing.
        return default
    return json.loads(raw_text)
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Serialise *payload* as indented, non-ASCII-escaped JSON and write it to *path* (UTF-8)."""
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def normalize_key(form: str, pos: str) -> Tuple[str, str]:
    """Build the lookup key for an entry: (lower-cased form, upper-cased POS).

    Falsy inputs (``None``, ``""``) become empty strings; surrounding
    whitespace is stripped from both components.
    """
    form_text = str(form or "").strip()
    pos_text = str(pos or "").strip()
    return (form_text.lower(), pos_text.upper())
|
||||
|
||||
|
||||
def merge_topics(existing: List[str], incoming: List[str]) -> List[str]:
    """Concatenate two topic lists, dropping blanks and case-insensitive duplicates.

    Items are stringified and stripped. The first spelling seen for a given
    lower-cased value wins, and first-seen order is preserved. ``None`` or an
    empty list is accepted for either argument.
    """
    # Insertion-ordered dict: lower-cased value -> first spelling encountered.
    first_spelling: Dict[str, str] = {}
    for raw in [*(existing or []), *(incoming or [])]:
        label = str(raw).strip()
        if label:
            first_spelling.setdefault(label.lower(), label)
    return list(first_spelling.values())
|
||||
|
||||
|
||||
def apply_patch(args: argparse.Namespace) -> Dict[str, Any]:
    """Apply the LLM rescue patch to the curated lexicon and write the result.

    Lexicon entries are matched to patch entries by ``normalize_key(form, pos)``.
    A matched patch is skipped (and counted as such) when it has no usable
    definition, when its confidence is below ``args.min_confidence``, or when it
    is flagged ``needs_human_review`` and ``args.include_needs_review`` is off.
    Otherwise the entry is updated in place: preferred definition/source, all
    four clue-definition levels, merged topics and semantic tags,
    ``alpha_ready``, filtered ``review_reasons`` and an ``llm_rescue`` record.

    Args:
        args: Namespace with ``lexicon``, ``patch``, ``output`` (paths),
            ``min_confidence`` and ``include_needs_review``.

    Returns:
        A dict with the ``applied`` and ``skipped`` counters and the ``output``
        path as a string.

    Raises:
        ValueError: If the lexicon is not a dict with a list under ``entries``,
            or the patch payload is not a dict.
    """
    lexicon_payload = load_json(args.lexicon, {"entries": []})
    patch_payload = load_json(args.patch, {"entries": []})
    if not isinstance(lexicon_payload, dict):
        raise ValueError(f"Lessico non valido: {args.lexicon}")
    lexicon = lexicon_payload.get("entries")
    if not isinstance(lexicon, list):
        raise ValueError(f"Lessico non valido: {args.lexicon}")
    if not isinstance(patch_payload, dict):
        raise ValueError(f"Patch non valida: {args.patch}")

    # Index the patch by (form, pos); a later duplicate overrides an earlier one.
    patch_by_key = {
        normalize_key(item.get("form", ""), item.get("pos", "")): item
        for item in patch_payload.get("entries") or []
        if isinstance(item, dict)
    }

    min_confidence = float(args.min_confidence)
    applied = 0
    skipped = 0
    for entry in lexicon:
        if not isinstance(entry, dict):
            continue
        patch = patch_by_key.get(normalize_key(entry.get("form", ""), entry.get("pos", "")))
        if not patch:
            continue

        confidence = float(patch.get("confidence", 0.0) or 0.0)
        needs_review = bool(patch.get("needs_human_review", True))
        # `or ""` matters: an explicit JSON null must not become the text "None".
        definition = str(patch.get("rescue_definition") or "").strip()

        if not definition:
            skipped += 1
            continue
        if confidence < min_confidence:
            skipped += 1
            continue
        if needs_review and not args.include_needs_review:
            skipped += 1
            continue

        rescue_source = patch.get("rescue_source", "llm_rescue")
        entry["preferred_definition"] = definition
        entry["preferred_source"] = rescue_source

        # The rescue definition replaces the clue at every difficulty level.
        clue_defs = entry.get("clue_definitions") or {}
        if not isinstance(clue_defs, dict):
            clue_defs = {}
        for level in ("easy", "medium", "hard", "expert"):
            clue_defs[level] = definition
        entry["clue_definitions"] = clue_defs

        entry["topics"] = merge_topics(entry.get("topics", []), patch.get("rescue_topics", []))
        entry["semantic_tags"] = merge_topics(
            entry.get("semantic_tags", []), patch.get("rescue_semantic_tags", [])
        )
        entry["alpha_ready"] = True

        # The rescue resolves "no_viable_definition"; "flagged_by_refined_stage"
        # is cleared only when review-flagged patches are not being applied.
        resolved_reasons = ("no_viable_definition",)
        if not args.include_needs_review:
            resolved_reasons += ("flagged_by_refined_stage",)
        entry["review_reasons"] = [
            reason for reason in (entry.get("review_reasons") or []) if reason not in resolved_reasons
        ]

        entry["llm_rescue"] = {
            "definition": definition,
            "source": rescue_source,
            "topics": patch.get("rescue_topics", []),
            "semantic_tags": patch.get("rescue_semantic_tags", []),
            "notes": patch.get("rescue_notes", ""),
            "confidence": confidence,
            "needs_human_review": needs_review,
            "status": patch.get("status", ""),
        }
        applied += 1

    # Carry over existing metadata and refresh the counters for the new file.
    meta = dict(lexicon_payload.get("meta") or {})
    meta["base_lexicon"] = args.lexicon.name
    meta["generated_from_patch"] = args.patch.name
    meta["generated_by"] = "apply_llm_rescue_patch.py"
    meta["entry_count"] = len(lexicon)
    meta["llm_rescue_applied"] = applied
    meta["llm_rescue_skipped"] = skipped
    meta["alpha_ready_count"] = sum(
        1 for item in lexicon if isinstance(item, dict) and item.get("alpha_ready")
    )
    meta["review_count"] = sum(
        1
        for item in lexicon
        if isinstance(item, dict) and (item.get("review_reasons") or item.get("needs_review"))
    )

    write_json(args.output, {"meta": meta, "entries": lexicon})
    return {
        "applied": applied,
        "skipped": skipped,
        "output": str(args.output),
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Script entry point: parse the CLI, apply the patch and print a short report."""
    outcome = apply_patch(parse_args())
    print(f"Lessico aggiornato generato: {outcome['output']}")
    print(f"Patch applicate: {outcome['applied']}")
    print(f"Voci saltate: {outcome['skipped']}")
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
490
babelnet_daily_batch.py
Normal file
490
babelnet_daily_batch.py
Normal file
@@ -0,0 +1,490 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
from babelnet_incremental_enricher import (
|
||||
DEFAULT_TOPIC,
|
||||
merge_babelnet_entries,
|
||||
rebuild_enriched,
|
||||
)
|
||||
from build_babelnet_enrichment import (
|
||||
BABELNET_CACHE_PATH,
|
||||
BABELNET_ENV_KEY,
|
||||
BABELNET_OUTPUT_PATH,
|
||||
BabelNetApiCallLimitReached,
|
||||
BabelNetKeyUnavailable,
|
||||
POS_TO_BABELNET,
|
||||
enrich_entry,
|
||||
load_babelnet_api_keys,
|
||||
load_json,
|
||||
write_json,
|
||||
)
|
||||
from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH
|
||||
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Directory (next to this script) where write_batch_log stores per-run JSON logs.
LOG_DIR = Path(__file__).with_name("logs")
# Default overall cap on real BabelNet API calls per run (--api-call-limit).
DEFAULT_API_CALL_LIMIT = 950
# Default cap on real API calls for each loaded key (--per-key-api-call-limit).
DEFAULT_PER_KEY_API_CALL_LIMIT = 950
# Default maximum number of candidate words attempted per run (--word-limit).
DEFAULT_WORD_LIMIT = 10_000
# Inclusive word-length bounds checked by eligible_entry.
MIN_WORD_LENGTH = 3
MAX_WORD_LENGTH = 16
# Per-POS weight used as one component of the candidate_priority sort key;
# higher values sort first, and a POS not listed here counts as 0.
USEFUL_POS_PRIORITY = {
    "NOUN": 6,
    "VERB": 5,
    "ADJ": 4,
    "ADV": 3,
}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Build the daily-batch command-line interface and parse the process arguments.

    Options cover the API budgets (overall and per key), key selection by
    1-based index, the candidate word limit, the pause between requests, an
    optional topic filter, four boolean switches and the three lexicon paths.

    Returns:
        The populated ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Batch giornaliero per fondere progressivamente ItalWordNet e BabelNet: "
            "arricchisce parole mancanti, aggiorna lexicon_it_babelnet.json e rigenera lexicon_it_enriched.json."
        )
    )

    # --- API budgets -------------------------------------------------------
    cli.add_argument(
        "--api-call-limit",
        type=int,
        default=DEFAULT_API_CALL_LIMIT,
        help="Numero massimo complessivo di chiamate API BabelNet reali consentite in questa esecuzione.",
    )
    cli.add_argument(
        "--per-key-api-call-limit",
        type=int,
        default=DEFAULT_PER_KEY_API_CALL_LIMIT,
        help="Numero massimo di chiamate API reali consentite per ciascuna chiave caricata.",
    )

    # --- API key selection (two spellings of the same option) ---------------
    cli.add_argument(
        "--token-index",
        default=None,
        help="Usa una o piu chiavi locali, contando da 1. Esempi: --token-index 2 oppure --token-index 1,2,3.",
    )
    cli.add_argument(
        "--token-indexes",
        default=None,
        help="Alias esplicito per una lista di chiavi locali. Esempio: --token-indexes 1,2,3.",
    )

    # --- Candidate selection and pacing --------------------------------------
    cli.add_argument(
        "--word-limit",
        type=int,
        default=DEFAULT_WORD_LIMIT,
        help="Numero massimo di parole candidate da tentare in questa esecuzione.",
    )
    cli.add_argument("--sleep", type=float, default=0.2, help="Pausa tra richieste API.")
    cli.add_argument(
        "--topic",
        default=None,
        help="Topic opzionale per concentrare il batch su una parte del lessico.",
    )

    # --- Boolean switches (registration order kept for identical --help) -----
    switches = (
        ("--include-not-crossword", "Include anche voci non marcate allowed_in_crossword."),
        ("--retry-no-match", "Riprova anche parole gia marcate come no_match."),
        ("--dry-run", "Mostra le prossime parole candidate senza chiamare BabelNet e senza scrivere file."),
        ("--ignore-cache", "Ignora la cache in questa esecuzione diagnostica, utile per testare un token specifico."),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action="store_true", help=help_text)

    # --- Lexicon file locations ----------------------------------------------
    path_options = (
        ("--semantic", SEMANTIC_LEXICON_OUTPUT_PATH, "Lessico semantico completo di partenza."),
        ("--babelnet", BABELNET_OUTPUT_PATH, "Archivio incrementale degli arricchimenti BabelNet."),
        ("--enriched", ENRICHED_LEXICON_OUTPUT_PATH, "Lessico fuso da rigenerare dopo il batch."),
    )
    for flag, default_path, help_text in path_options:
        cli.add_argument(flag, type=Path, default=default_path, help=help_text)

    return cli.parse_args()
|
||||
|
||||
|
||||
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Return the (form, POS) key of *entry*.

    The form comes from ``normalized_form`` when truthy, else ``form``; it is
    stripped and lower-cased. The POS is stripped and upper-cased. Missing or
    falsy values yield empty strings.
    """
    raw_form = entry.get("normalized_form") or entry.get("form") or ""
    raw_pos = entry.get("pos") or ""
    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
|
||||
|
||||
|
||||
def load_source_payload(enriched_path: Path, semantic_path: Path) -> Dict[str, object]:
    """Load the lexicon to draw candidates from, preferring the enriched one.

    The enriched lexicon is used when its file exists and decodes to a dict
    containing an ``entries`` key; otherwise the semantic lexicon is tried
    with the same validity rule.

    Raises:
        ValueError: If neither file yields a valid payload.
    """

    def _has_entries(candidate: object) -> bool:
        return isinstance(candidate, dict) and "entries" in candidate

    if enriched_path.exists():
        enriched = load_json(enriched_path, {})
        if _has_entries(enriched):
            return enriched

    semantic = load_json(semantic_path, {})
    if not _has_entries(semantic):
        raise ValueError(f"Nessun lessico valido trovato: {enriched_path} / {semantic_path}")
    return semantic
|
||||
|
||||
|
||||
def babelnet_status(entry: Dict[str, object]) -> str:
    """Return the BabelNet status recorded on *entry* as a string.

    ``"not_requested"`` is returned when the entry has no ``babelnet`` dict or
    that dict has no ``status`` key.
    """
    block = entry.get("babelnet", {})
    if not isinstance(block, dict):
        return "not_requested"
    return str(block.get("status", "not_requested"))
|
||||
|
||||
|
||||
def entry_topics(entry: Dict[str, object]) -> set[str]:
    """Collect the lower-cased topics of *entry*.

    Combines the truthy items of ``entry["topics"]`` with the truthy items of
    ``entry["semantic"]["semantic_topics"]`` (when ``semantic`` is a dict).
    """
    collected: set = set()
    for item in entry.get("topics", []) or []:
        if item:
            collected.add(str(item).lower())

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        for item in semantic.get("semantic_topics", []) or []:
            if item:
                collected.add(str(item).lower())
    return collected
|
||||
|
||||
|
||||
def eligible_entry(entry: Dict[str, object], args: argparse.Namespace) -> bool:
    """Decide whether *entry* should be sent to BabelNet in this run.

    An entry qualifies only if all of the following hold:
      * its BabelNet status is ``not_requested`` or ``api_error`` (plus
        ``no_match`` when ``args.retry_no_match`` is set);
      * its POS is a key of ``POS_TO_BABELNET``;
      * its form is alphabetic and within the configured length bounds;
      * it is ``allowed_in_crossword``, unless ``args.include_not_crossword``;
      * it carries ``args.topic`` among its topics, when a topic is given.
    """
    word = str(entry.get("form", "")).strip().lower()
    pos = str(entry.get("pos", "")).strip().upper()
    status = babelnet_status(entry)

    retryable = {"not_requested", "api_error"}
    if args.retry_no_match:
        retryable.add("no_match")

    if status not in retryable:
        return False
    if pos not in POS_TO_BABELNET:
        return False
    if not (word.isalpha() and MIN_WORD_LENGTH <= len(word) <= MAX_WORD_LENGTH):
        return False
    if not (args.include_not_crossword or entry.get("allowed_in_crossword", False)):
        return False
    if args.topic:
        wanted_topic = args.topic.strip().lower()
        if wanted_topic not in entry_topics(entry):
            return False
    return True
|
||||
|
||||
|
||||
def candidate_priority(entry: Dict[str, object]) -> Tuple[int, int, int, int, int, str]:
    """Return the sort key used to rank candidate entries (larger sorts first).

    Components, in order: bonus for having a topic other than the default
    topic / "abstract" / "actions" (2 or 0), bonus for having any semantic
    topic (1 or 0), ``quality_score``, the POS weight from
    ``USEFUL_POS_PRIORITY``, a length bonus (3 for 4-11 characters, else 1)
    and finally the word itself as a tie-breaker.
    """
    word = str(entry.get("form", ""))
    pos = str(entry.get("pos", "")).upper()

    topics = {str(item).lower() for item in entry.get("topics", []) or []}
    generic_topics = {DEFAULT_TOPIC, "abstract", "actions"}
    has_specific_topic = bool(topics - generic_topics)

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        semantic_topics = {str(item).lower() for item in semantic.get("semantic_topics", []) or []}
    else:
        semantic_topics = set()

    if 4 <= len(word) <= 11:
        length_bonus = 3
    else:
        length_bonus = 1

    return (
        2 if has_specific_topic else 0,
        1 if semantic_topics else 0,
        int(entry.get("quality_score", 0)),
        USEFUL_POS_PRIORITY.get(pos, 0),
        length_bonus,
        word,
    )
|
||||
|
||||
|
||||
def select_candidates(payload: Dict[str, object], args: argparse.Namespace) -> List[Dict[str, object]]:
    """Return the eligible entries of *payload*, best first, capped at ``args.word_limit``.

    Non-dict entries are ignored; a negative word limit is treated as zero.
    """
    pool = []
    for entry in payload.get("entries", []) or []:
        if isinstance(entry, dict) and eligible_entry(entry, args):
            pool.append(entry)
    ranked = sorted(pool, key=candidate_priority, reverse=True)
    cap = max(0, args.word_limit)
    return ranked[:cap]
|
||||
|
||||
|
||||
def progress_counts(payload: Dict[str, object]) -> Dict[str, int]:
    """Count the dict entries of *payload* by their BabelNet status."""
    tally: Dict[str, int] = {}
    for entry in payload.get("entries", []) or []:
        if isinstance(entry, dict):
            status = babelnet_status(entry)
            tally.setdefault(status, 0)
            tally[status] += 1
    return tally
|
||||
|
||||
|
||||
def parse_token_indexes(value: Optional[str], key_count: int, option_name: str) -> Optional[List[int]]:
    """Parse a comma/semicolon separated list of 1-based key indexes.

    Args:
        value: Raw option text, or ``None`` when the option was not given.
        key_count: Number of loaded keys; valid indexes are ``1..key_count``.
        option_name: Option spelling used in the error messages.

    Returns:
        ``None`` when *value* is ``None``; otherwise the distinct indexes in
        first-seen order. Empty parts are ignored.

    Raises:
        SystemExit: On a non-numeric part, an out-of-range index, or when no
            index remains after parsing.
    """
    if value is None:
        return None

    normalized = str(value).replace(";", ",")
    chosen: List[int] = []
    for token in (piece.strip() for piece in normalized.split(",")):
        if token == "":
            continue
        try:
            number = int(token)
        except ValueError as exc:
            raise SystemExit(f"{option_name} deve contenere solo numeri separati da virgola.") from exc
        if number < 1 or number > key_count:
            raise SystemExit(
                f"{option_name} contiene {number}, ma deve essere tra 1 e {key_count}. Chiavi caricate: {key_count}."
            )
        if number not in chosen:
            chosen.append(number)

    if chosen:
        return chosen
    raise SystemExit(f"{option_name} non contiene nessun indice valido.")
|
||||
|
||||
|
||||
def write_batch_log(payload: Dict[str, object]) -> Path:
    """Write *payload* as JSON to a timestamped file under ``LOG_DIR`` and return its path.

    The directory is created if needed; the file name embeds the local time
    with one-second resolution (``babelnet_batch_YYYYmmdd_HHMMSS.json``).
    """
    LOG_DIR.mkdir(exist_ok=True)
    stamp = datetime.now().astimezone().strftime("%Y%m%d_%H%M%S")
    log_file = LOG_DIR / f"babelnet_batch_{stamp}.json"
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    log_file.write_text(serialized, encoding="utf-8")
    return log_file
|
||||
|
||||
|
||||
def run_batch(args: argparse.Namespace) -> Dict[str, object]:
    """Run one BabelNet enrichment batch and return a summary of the run.

    Steps: load the source lexicon, pick the candidates, then (unless
    ``args.dry_run``) enrich them one by one with the loaded API keys while
    honouring the overall and per-key call limits, merge the results into the
    BabelNet archive, rebuild the enriched lexicon and write a batch log.

    Args:
        args: Namespace produced by ``parse_args``.

    Returns:
        In dry-run mode, a dict with ``mode``, ``candidate_count``, the first
        50 ``selected_words`` and ``before_counts``. Otherwise a dict with the
        run statistics, per-word logs and the ``log_path`` of the written log.

    Raises:
        SystemExit: If no API key is available, or if both ``--token-index``
            and ``--token-indexes`` are given (or either is invalid).
    """
    source_payload = load_source_payload(args.enriched, args.semantic)
    candidates = select_candidates(source_payload, args)
    before_counts = progress_counts(source_payload)

    if args.dry_run:
        return {
            "mode": "dry-run",
            "candidate_count": len(candidates),
            "selected_words": [entry.get("form") for entry in candidates[:50]],
            "before_counts": before_counts,
        }

    api_keys = load_babelnet_api_keys()
    if not api_keys:
        raise SystemExit(
            f"Chiave BabelNet mancante. Imposta {BABELNET_ENV_KEY} oppure crea .babelnet_api_key.local."
        )
    token_indexes = parse_token_indexes(args.token_index, len(api_keys), "--token-index")
    token_indexes_alias = parse_token_indexes(args.token_indexes, len(api_keys), "--token-indexes")
    if token_indexes and token_indexes_alias:
        raise SystemExit("Usa solo uno tra --token-index e --token-indexes.")
    selected_token_indexes = token_indexes or token_indexes_alias
    if selected_token_indexes:
        # User indexes are 1-based.
        api_keys = [api_keys[index - 1] for index in selected_token_indexes]

    cache = {} if args.ignore_cache else load_json(BABELNET_CACHE_PATH, {})
    if not isinstance(cache, dict):
        cache = {}
    babelnet_payload = load_json(args.babelnet, {"entries": []})
    if not isinstance(babelnet_payload, dict):
        babelnet_payload = {"entries": []}

    counter_names = ("api_calls", "cache_hits", "responses")
    global_stats = {
        "api_calls": 0,
        "cache_hits": 0,
        "responses": 0,
        "api_call_limit": max(0, args.api_call_limit),
    }
    per_key_limit = max(0, args.per_key_api_call_limit)
    key_stats = [
        {
            # key_index is the user-facing (original) index, local_key_index the
            # position inside the possibly filtered api_keys list.
            "key_index": selected_token_indexes[index] if selected_token_indexes else index + 1,
            "local_key_index": index + 1,
            "api_calls": 0,
            "cache_hits": 0,
            "responses": 0,
            "api_call_limit": per_key_limit,
        }
        for index, _ in enumerate(api_keys)
    ]
    enriched_entries: List[Dict[str, object]] = []
    word_logs = []
    stopped_reason = "completed"

    def select_key_index() -> Optional[int]:
        """Return the position of the least-used key still under its limit, or None."""
        available = [
            (stats["api_calls"], index)
            for index, stats in enumerate(key_stats)
            if stats["api_calls"] < stats["api_call_limit"]
        ]
        if not available:
            return None
        available.sort()
        return available[0][1]

    def absorb_key_deltas(stats: Dict[str, int], baseline: Dict[str, int]) -> None:
        """Add to the global counters what *stats* gained since *baseline* was taken."""
        for name in counter_names:
            global_stats[name] += stats[name] - baseline[name]

    for index, entry in enumerate(candidates, start=1):
        if global_stats["api_calls"] >= global_stats["api_call_limit"]:
            stopped_reason = "api_call_limit"
            break
        key_index = select_key_index()
        if key_index is None:
            stopped_reason = "per_key_api_call_limit"
            break

        stats = key_stats[key_index]
        global_before = {name: global_stats[name] for name in counter_names}
        key_before = {name: stats[name] for name in counter_names}

        # Work on a copy and drop any previous BabelNet block before enriching.
        updated = deepcopy(entry)
        updated.pop("babelnet", None)
        try:
            updated["babelnet"] = enrich_entry(updated, api_keys[key_index], cache, args.sleep, stats)
        except BabelNetApiCallLimitReached:
            absorb_key_deltas(stats, key_before)
            stopped_reason = "per_key_api_call_limit"
            break
        except BabelNetKeyUnavailable as exc:
            absorb_key_deltas(stats, key_before)
            # Mark the key as exhausted so select_key_index stops offering it.
            stats["api_calls"] = stats["api_call_limit"]
            word_logs.append(
                {
                    "index": index,
                    "word": updated.get("form"),
                    "pos": updated.get("pos"),
                    "key_index": stats["key_index"],
                    "api_calls": global_stats["api_calls"] - global_before["api_calls"],
                    "cache_hits": global_stats["cache_hits"] - global_before["cache_hits"],
                    "responses": global_stats["responses"] - global_before["responses"],
                    "matched": False,
                    "synsets": 0,
                    "reason": "key_unavailable_or_daily_limit",
                    "error": str(exc),
                }
            )
            print(
                f"[{index}/{len(candidates)}] {updated.get('form')}: "
                f"token={stats['key_index']} non disponibile o limite giornaliero raggiunto"
            )
            if select_key_index() is None:
                stopped_reason = "all_keys_unavailable_or_daily_limit"
                break
            continue

        absorb_key_deltas(stats, key_before)

        enriched_entries.append(updated)
        # With --ignore-cache the in-memory cache started empty: persisting it
        # would replace the on-disk cache with only this run's responses.
        if not args.ignore_cache:
            write_json(BABELNET_CACHE_PATH, cache)

        babelnet_info = updated.get("babelnet", {})
        word_log = {
            "index": index,
            "word": updated.get("form"),
            "pos": updated.get("pos"),
            "key_index": stats["key_index"],
            "api_calls": global_stats["api_calls"] - global_before["api_calls"],
            "cache_hits": global_stats["cache_hits"] - global_before["cache_hits"],
            "responses": global_stats["responses"] - global_before["responses"],
            "matched": bool(babelnet_info.get("matched")),
            "synsets": len(babelnet_info.get("synsets", []) or []),
            "reason": babelnet_info.get("reason"),
        }
        word_logs.append(word_log)
        print(
            f"[{index}/{len(candidates)}] {word_log['word']}: "
            f"token={word_log['key_index']} api_calls={word_log['api_calls']} cache_hits={word_log['cache_hits']} "
            f"match={word_log['matched']} tot_api={global_stats['api_calls']}/{global_stats['api_call_limit']}"
        )

    merged_babelnet = merge_babelnet_entries(
        babelnet_payload,
        enriched_entries,
        args.topic or "all",
        "all",
    )
    write_json(args.babelnet, merged_babelnet)
    enriched_payload = rebuild_enriched(
        args.semantic,
        args.babelnet,
        args.enriched,
        args.topic or DEFAULT_TOPIC,
    )
    after_counts = progress_counts(enriched_payload)

    # Coverage = share of entries whose status is anything but "not_requested".
    total_entries = int(enriched_payload.get("meta", {}).get("entry_count", 0))
    covered = total_entries - after_counts.get("not_requested", 0)
    coverage = covered / total_entries if total_entries else 0.0

    result = {
        "mode": "batch",
        "started_topic": args.topic,
        "stopped_reason": stopped_reason,
        "candidate_count": len(candidates),
        "attempted_words": len(enriched_entries),
        "matched_words": sum(1 for entry in enriched_entries if entry.get("babelnet", {}).get("matched")),
        "api_calls": global_stats["api_calls"],
        "cache_hits": global_stats["cache_hits"],
        "responses": global_stats["responses"],
        "api_call_limit": global_stats["api_call_limit"],
        "api_key_count": len(api_keys),
        "forced_token_indexes": selected_token_indexes,
        "per_key_api_call_limit": per_key_limit,
        "per_key_stats": key_stats,
        "before_counts": before_counts,
        "after_counts": after_counts,
        "total_entries": total_entries,
        "covered_entries": covered,
        "coverage_ratio": coverage,
        "word_logs": word_logs,
    }
    # The log is written before log_path is added, so the file itself lacks that key.
    log_path = write_batch_log(result)
    result["log_path"] = str(log_path)
    return result
|
||||
|
||||
|
||||
def print_result(result: Dict[str, object]) -> None:
    """Print a human-readable report of a ``run_batch`` result to stdout.

    Dry-run results list the candidate count, the initial status counts and
    the selected words; batch results list the stop reason, attempt/match
    counts, API usage (overall and per key), cache hits, coverage and log path.
    """
    if result["mode"] == "dry-run":
        print("Dry-run batch BabelNet")
        print(f"Candidate selezionate: {result['candidate_count']}")
        print(f"Stati iniziali: {result['before_counts']}")
        print("Prime parole:")
        for position, word in enumerate(result["selected_words"], start=1):
            print(f"{position:>2}. {word}")
        return

    print("Batch BabelNet completato")
    print(f"- motivo stop: {result['stopped_reason']}")
    print(f"- parole tentate: {result['attempted_words']}/{result['candidate_count']}")
    print(f"- parole con match: {result['matched_words']}")
    print(f"- chiamate API reali: {result['api_calls']}/{result['api_call_limit']}")
    print(f"- chiavi caricate: {result['api_key_count']} (limite per chiave: {result['per_key_api_call_limit']})")

    # Only shown when the user restricted the run to specific keys.
    if result.get("forced_token_indexes"):
        forced_labels = ", ".join(f"#{token}" for token in result["forced_token_indexes"])
        print(f"- token forzati: {forced_labels}")

    for key_report in result["per_key_stats"]:
        print(f" chiave #{key_report['key_index']}: {key_report['api_calls']}/{key_report['api_call_limit']} chiamate API")

    print(f"- cache hit: {result['cache_hits']}")
    print(f"- copertura lessico: {result['covered_entries']}/{result['total_entries']} ({result['coverage_ratio'] * 100:.1f}%)")
    print(f"- stati dopo: {result['after_counts']}")
    print(f"- log: {result['log_path']}")
|
||||
|
||||
|
||||
def main() -> None:
    """Script entry point: parse the CLI, run the batch and print its report."""
    print_result(run_batch(parse_args()))
|
||||
|
||||
|
||||
# Run the batch only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
583
babelnet_incremental_enricher.py
Normal file
583
babelnet_incremental_enricher.py
Normal file
@@ -0,0 +1,583 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
from build_babelnet_enrichment import (
|
||||
BABELNET_CACHE_PATH,
|
||||
BABELNET_ENV_KEY,
|
||||
BABELNET_OUTPUT_PATH,
|
||||
POS_TO_BABELNET,
|
||||
enrich_entry,
|
||||
load_json,
|
||||
write_json,
|
||||
)
|
||||
from build_enriched_lexicon import (
|
||||
ENRICHED_LEXICON_OUTPUT_PATH,
|
||||
build_enriched_lexicon,
|
||||
write_json as write_enriched_json,
|
||||
)
|
||||
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Named difficulty levels accepted by parse_difficulty, mapped to the 1-5 scale.
# Note that no alias maps to level 3.
DIFFICULTY_ALIASES: Dict[str, int] = {
    "easy": 1,
    "medium": 2,
    "hard": 4,
    "expert": 5,
}
# Topic used when the caller does not specify one (default of --topic).
DEFAULT_TOPIC = "general"
# NOTE(review): the constants below are consumed outside the visible part of this
# file; the comments describe their content only, and intent is inferred from names.
# Word endings, presumably marking abstract nouns — confirm against the consumer.
ABSTRACTISH_SUFFIXES = ("zione", "zioni", "mento", "menti", "ita", "ezza", "anza", "enza", "ismo")
# Parts of speech, presumably those allowed as grid fill words — TODO confirm.
FILL_ALLOWED_POS = {"NOUN", "VERB", "ADJ", "ADV", "PREP", "CONJ"}
# Numeric thresholds; judging by the names they bound "general" fill words
# (minimum quality, maximum length) and cap soft-related fill — TODO confirm.
GENERAL_FILL_MIN_QUALITY = 6
GENERAL_FILL_MAX_LENGTH = 10
SOFT_RELATED_FILL_LIMIT = 120
# Set of topic names, presumably treated as concrete subject areas — TODO confirm.
CONCRETE_TOPICS = {
    "animals",
    "plants",
    "nature",
    "ecology",
    "geography",
    "weather",
    "sea",
    "mountain",
    "health",
    "science",
    "sport",
    "history",
    "school",
    "cinema",
    "literature",
    "food",
    "city",
    "transport",
    "work",
    "home",
}

# Per-topic word fragments; by name, a seed word presumably must contain one of
# them to count for that topic — verify against the code that reads this table.
TOPIC_SEED_REQUIRED_SUBSTRINGS: Dict[str, Tuple[str, ...]] = {
    "transport": (
        "auto", "mot", "tren", "nav", "barc", "port", "pist", "vol", "aer",
        "bici", "cicl", "rimorch", "reattor", "vettur", "ambul", "imbarc",
        "trattor", "carr", "vap", "rota", "ruot",
    ),
    "animals": (
        "can", "gatt", "lup", "ors", "pesc", "aquil", "anatr", "cavall",
        "serpent", "tig", "leon", "volp", "cerv", "capr", "pecor",
    ),
    "nature": (
        "mar", "lag", "fium", "vent", "bosch", "mont", "collin", "isol",
        "rocc", "terra", "acqu", "fiore", "fogli", "radic", "affluent",
        "litoral", "piogg", "nev", "onda", "clim",
    ),
    "cinema": (
        "film", "cin", "teatr", "attor", "scen", "reg", "doppi", "dialog",
        "comic", "div", "docu", "pellic", "spettacol",
    ),
}

# Per-topic word fragments; by name, presumably fragments that disqualify a seed
# word for that topic — verify against the code that reads this table.
TOPIC_SEED_BLOCKED_SUBSTRINGS: Dict[str, Tuple[str, ...]] = {
    "transport": (
        "intervist", "intratten", "speriment", "stermin", "investig",
        "intervent", "centometr", "sintetizz", "erot", "adoraz", "esalt",
        "eccit", "traduz", "fluttu", "sollecit",
    ),
    "animals": (
        "assicur", "finanz", "coediz", "camerier", "servitor", "indic",
        "estens", "diffus", "difensor", "spessor", "maggior",
    ),
    "cinema": (
        "manifest", "riediz", "dissimul", "diffus", "difensor", "estens",
        "malumor", "eversor",
    ),
}

# BabelNet statuses, presumably the ones for which an entry may be (re)enriched
# — TODO confirm against the selection logic.
ENRICHABLE_STATUSES = {"not_requested", "api_error"}

# Per-topic word prefixes; by name, presumably prefixes considered safe for that
# topic — verify against the code that reads this table.
BABELNET_TOPIC_SAFE_PREFIXES: Dict[str, Tuple[str, ...]] = {
    "transport": (
        "ambul",
        "aer",
        "autobus",
        "autocar",
        "automob",
        "autostrad",
        "autoveic",
        "autovett",
        "bicicl",
        "ciclo",
        "imbarc",
        "locom",
        "motoc",
        "motr",
        "navig",
        "rimorch",
        "trattor",
        "tren",
        "veicol",
        "vettur",
    ),
}
|
||||
|
||||
|
||||
def parse_difficulty(value: str) -> int:
    """Convert a ``--difficulty`` value into a level between 1 and 5.

    Accepts (case-insensitively, ignoring surrounding whitespace) either one
    of the names in ``DIFFICULTY_ALIASES`` or an integer from 1 to 5.

    Raises:
        SystemExit: If the value is neither a known alias nor an integer, or
            if the integer is outside 1-5.
    """
    text = str(value).strip().lower()

    alias_level = DIFFICULTY_ALIASES.get(text)
    if alias_level is not None:
        return alias_level

    try:
        level = int(text)
    except ValueError as exc:
        raise SystemExit(
            "Valore non valido per --difficulty. Usa easy, medium, hard, expert oppure un intero tra 1 e 5."
        ) from exc

    if level < 1 or level > 5:
        raise SystemExit("Il valore numerico di --difficulty deve essere compreso tra 1 e 5.")
    return level
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Define and parse the command line of the incremental enrichment script.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    cli = argparse.ArgumentParser(
        description=(
            "Arricchisce incrementalmente il lessico: seleziona parole mancanti, "
            "chiama BabelNet entro un limite e rigenera lexicon_it_enriched.json."
        )
    )
    add = cli.add_argument

    # Credentials and candidate selection.
    add(
        "--api-key",
        default=os.environ.get(BABELNET_ENV_KEY),
        help=f"Chiave API BabelNet. In alternativa imposta la variabile ambiente {BABELNET_ENV_KEY}.",
    )
    add("--topic", default=DEFAULT_TOPIC, help="Topic per cui scegliere le prossime parole da arricchire.")
    add("--difficulty", default="medium", help="Difficolta massima: easy, medium, hard, expert oppure 1-5.")
    add("--limit", type=int, default=50, help="Numero massimo di parole da arricchire in questa esecuzione.")
    add("--sleep", type=float, default=0.2, help="Pausa tra richieste API.")

    # Input and output files.
    add(
        "--semantic",
        type=Path,
        default=SEMANTIC_LEXICON_OUTPUT_PATH,
        help="Lessico semantico completo di partenza.",
    )
    add(
        "--babelnet",
        type=Path,
        default=BABELNET_OUTPUT_PATH,
        help="Archivio degli arricchimenti BabelNet parziali.",
    )
    add(
        "--enriched",
        type=Path,
        default=ENRICHED_LEXICON_OUTPUT_PATH,
        help="Lessico arricchito da aggiornare.",
    )

    # Behaviour switches.
    add(
        "--dry-run",
        action="store_true",
        help="Mostra le parole candidate senza chiamare BabelNet e senza scrivere file.",
    )
    add("--retry-no-match", action="store_true", help="Riprova anche parole gia marcate come no_match.")
    add(
        "--words",
        nargs="*",
        default=None,
        help="Parole specifiche da arricchire, utile per generare definizioni sul cruciverba finale.",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Return the ``(form, pos)`` identity of a lexicon entry.

    The form comes from ``normalized_form`` when it is truthy, otherwise from
    ``form``; it is stripped and lower-cased. The part of speech is stripped
    and upper-cased. Missing values become empty strings.
    """
    raw_form = entry.get("normalized_form") or entry.get("form") or ""
    raw_pos = entry.get("pos") or ""
    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
|
||||
|
||||
|
||||
def dedupe(items: Iterable[Dict[str, object]]) -> List[Dict[str, object]]:
    """Drop entries whose ``entry_key`` was already seen, keeping the first.

    Args:
        items: Lexicon entries, possibly containing repeated (form, pos) pairs.

    Returns:
        The first entry for each key, in the order in which keys first appear.
    """
    first_by_key: Dict[Tuple[str, str], Dict[str, object]] = {}
    for item in items:
        # setdefault stores only the first entry seen for a given key.
        first_by_key.setdefault(entry_key(item), item)
    return list(first_by_key.values())
|
||||
|
||||
|
||||
def entry_topics(entry: Dict[str, object]) -> Tuple[set[str], set[str]]:
    """Collect the lower-cased topic labels attached to an entry.

    Args:
        entry: A lexicon entry. ``topics`` and ``semantic.semantic_topics``
            may be missing or ``None`` (a JSON ``null``); both cases are
            treated as "no topics".

    Returns:
        A pair ``(topics, semantic_topics)``. The second set is empty when
        ``semantic`` is not a dict. Falsy items are skipped.
    """
    # `or ()` covers an explicit None, which a .get() default does not.
    topics = {str(item).lower() for item in entry.get("topics") or () if item}

    semantic_topics: set[str] = set()
    semantic = entry.get("semantic")
    if isinstance(semantic, dict):
        semantic_topics = {
            str(item).lower() for item in semantic.get("semantic_topics") or () if item
        }
    return topics, semantic_topics
|
||||
|
||||
|
||||
def current_babelnet_status(entry: Dict[str, object]) -> str:
    """Return the BabelNet status stored on an entry.

    Falls back to ``"not_requested"`` when the entry has no ``babelnet`` dict
    or that dict has no ``status`` key.
    """
    babelnet = entry.get("babelnet", {})
    if not isinstance(babelnet, dict):
        return "not_requested"
    return str(babelnet.get("status", "not_requested"))
|
||||
|
||||
|
||||
def matches_topic_roots(word: str, topic: str) -> bool:
    """Tell whether ``word`` contains one of the topic's required roots.

    A topic with no roots in ``TOPIC_SEED_REQUIRED_SUBSTRINGS`` accepts every
    word.
    """
    roots = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(topic, ())
    if not roots:
        return True
    for root in roots:
        if root in word:
            return True
    return False
|
||||
|
||||
|
||||
def matches_safe_babelnet_roots(word: str, topic: str) -> bool:
    """Tell whether ``word`` starts with one of the topic's safe prefixes.

    Topics absent from ``BABELNET_TOPIC_SAFE_PREFIXES`` never match.
    """
    prefixes = BABELNET_TOPIC_SAFE_PREFIXES.get(topic)
    if prefixes is None:
        return False
    # str.startswith accepts a tuple and returns True on the first hit.
    return word.startswith(tuple(prefixes))
|
||||
|
||||
|
||||
def is_blocked_for_topic(word: str, topic: str) -> bool:
    """Tell whether ``word`` contains a fragment blocked for ``topic``."""
    for fragment in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(topic, ()):
        if fragment in word:
            return True
    return False
|
||||
|
||||
|
||||
def topic_score(entry: Dict[str, object], topic: str) -> int:
    """Score how well an entry fits ``topic``.

    The default topic always scores 20. For any other topic the score is the
    sum of the bonuses and penalties listed below that apply to the entry.
    """
    if topic == DEFAULT_TOPIC:
        return 20

    word = str(entry.get("form", "")).lower()
    direct_topics, semantic_topics = entry_topics(entry)

    # (condition, weight) pairs; every condition that holds contributes.
    adjustments = (
        (topic in direct_topics, 100),
        (topic in semantic_topics, 45),
        (matches_topic_roots(word, topic), 35),
        (DEFAULT_TOPIC in direct_topics, 5),
        (is_blocked_for_topic(word, topic), -100),
        (topic in CONCRETE_TOPICS and word.endswith(ABSTRACTISH_SUFFIXES), -30),
    )
    return sum(weight for applies, weight in adjustments if applies)
|
||||
|
||||
|
||||
def candidate_score(entry: Dict[str, object], topic: str) -> Tuple[int, int, int, int, int, str]:
    """Build the sort key used to rank enrichment candidates.

    Returns:
        ``(topic score, quality score, POS bonus, semantic bonus, length
        bonus, form)``; callers sort on it in descending order.
    """
    word = str(entry.get("form", ""))
    pos = str(entry.get("pos", ""))

    pos_weights = {"NOUN": 12, "VERB": 8, "ADJ": 6, "ADV": 4}
    pos_bonus = pos_weights.get(pos, 0)

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict) and semantic.get("matched"):
        semantic_bonus = 3
    else:
        semantic_bonus = 0

    length = len(word)
    if 4 <= length <= 10:
        length_bonus = 4
    elif length <= 14:
        length_bonus = 1
    else:
        length_bonus = -3

    return (
        topic_score(entry, topic),
        int(entry.get("quality_score", 0)),
        pos_bonus,
        semantic_bonus,
        length_bonus,
        word,
    )
|
||||
|
||||
|
||||
def eligible_for_babelnet(entry: Dict[str, object], topic: str, difficulty_level: int, retry_no_match: bool) -> bool:
    """Decide whether an entry may be sent to BabelNet for ``topic``.

    Args:
        entry: Lexicon entry to evaluate.
        topic: Normalized topic driving the selection.
        difficulty_level: Highest ``difficulty_word`` still accepted.
        retry_no_match: Also accept entries already marked ``no_match``.

    Returns:
        True when the entry passes the status, shape, part-of-speech,
        difficulty, crossword and topic filters.
    """
    word = str(entry.get("form", "")).lower()
    pos = str(entry.get("pos", ""))
    topics, semantic_topics = entry_topics(entry)
    status = current_babelnet_status(entry)

    accepted_statuses = set(ENRICHABLE_STATUSES)
    if retry_no_match:
        accepted_statuses.add("no_match")

    # Generic filters, independent of the requested topic.
    if status not in accepted_statuses:
        return False
    if not word.isalpha() or not 3 <= len(word) <= 16:
        return False
    if pos not in POS_TO_BABELNET or pos not in FILL_ALLOWED_POS:
        return False
    if int(entry.get("difficulty_word", 5)) > difficulty_level:
        return False
    if not entry.get("allowed_in_crossword", False):
        return False

    if topic == DEFAULT_TOPIC:
        return True

    # Topic-specific filters.
    if topic in CONCRETE_TOPICS and word.endswith(ABSTRACTISH_SUFFIXES):
        return False
    if topic in topics:
        return True
    if matches_safe_babelnet_roots(word, topic):
        return True
    # A semantic-only link is trusted for non-concrete topics only.
    return topic in semantic_topics and topic not in CONCRETE_TOPICS
|
||||
|
||||
|
||||
def select_candidates(
    payload: Dict[str, object],
    topic: str,
    difficulty_level: int,
    limit: int,
    retry_no_match: bool,
) -> List[Dict[str, object]]:
    """Pick the best entries to enrich for ``topic``.

    Eligible entries are filtered with ``eligible_for_babelnet``. For a
    non-default topic they are split into three groups: "strong" (topic listed
    on the entry), "soft" (good quality and short enough, capped at
    ``SOFT_RELATED_FILL_LIMIT``) and "support" (good quality, longer, without
    an abstract-looking suffix). The groups are built in one pass rather than
    with repeated ``entry not in list`` scans, which compared dicts pairwise
    and grew quadratically with the lexicon size.

    Args:
        payload: Lexicon payload holding an ``entries`` list.
        topic: Normalized topic.
        difficulty_level: Highest accepted ``difficulty_word``.
        limit: Maximum number of entries to return.
        retry_no_match: Also accept entries already marked ``no_match``.

    Returns:
        Deduplicated entries sorted by ``candidate_score`` (best first),
        truncated to ``limit``.
    """
    entries = [
        entry
        for entry in payload.get("entries", []) or []
        if isinstance(entry, dict) and eligible_for_babelnet(entry, topic, difficulty_level, retry_no_match)
    ]

    def ranking(item: Dict[str, object]) -> Tuple[int, int, int, int, int, str]:
        return candidate_score(item, topic)

    if topic != DEFAULT_TOPIC:
        strong: List[Dict[str, object]] = []
        soft: List[Dict[str, object]] = []
        support: List[Dict[str, object]] = []
        for entry in entries:
            if topic in entry_topics(entry)[0]:
                strong.append(entry)
                continue
            # Both remaining groups require a minimum quality.
            if int(entry.get("quality_score", 0)) < GENERAL_FILL_MIN_QUALITY:
                continue
            form = str(entry.get("form", ""))
            if len(form) <= GENERAL_FILL_MAX_LENGTH:
                soft.append(entry)
            elif not form.endswith(ABSTRACTISH_SUFFIXES):
                support.append(entry)

        entries = strong + sorted(soft, key=ranking, reverse=True)[:SOFT_RELATED_FILL_LIMIT]
        entries += sorted(support, key=ranking, reverse=True)

    entries = dedupe(entries)
    entries.sort(key=ranking, reverse=True)
    return entries[:limit]
|
||||
|
||||
|
||||
def select_word_candidates(
    payload: Dict[str, object],
    words: Iterable[str],
    limit: int,
    retry_no_match: bool,
) -> List[Dict[str, object]]:
    """Select the lexicon entries matching an explicit list of words.

    Args:
        payload: Lexicon payload holding an ``entries`` list.
        words: Requested words; they are stripped, lower-cased and
            de-duplicated while keeping their order.
        limit: Maximum number of entries to return. A value of zero or less
            returns an empty list (previously one entry slipped through
            because the limit was checked only after appending).
        retry_no_match: Also accept entries already marked ``no_match``.

    Returns:
        At most one entry per requested word, in request order. When a form
        has several entries (for example one per part of speech) the last
        enrichable one is used, so an ineligible homograph no longer hides an
        eligible one.
    """
    if limit <= 0:
        return []

    requested: List[str] = []
    seen_words = set()
    for word in words:
        normalized = str(word).strip().lower()
        if normalized and normalized not in seen_words:
            requested.append(normalized)
            seen_words.add(normalized)

    allowed_statuses = set(ENRICHABLE_STATUSES)
    if retry_no_match:
        allowed_statuses.add("no_match")

    def enrichable(entry: Dict[str, object]) -> bool:
        return (
            current_babelnet_status(entry) in allowed_statuses
            and str(entry.get("pos", "")) in POS_TO_BABELNET
            and str(entry.get("form", "")).isalpha()
        )

    by_word: Dict[str, Dict[str, object]] = {}
    for entry in payload.get("entries", []) or []:
        if not isinstance(entry, dict):
            continue
        form = str(entry.get("form", "")).lower()
        if form not in seen_words:
            continue
        current = by_word.get(form)
        # Later entries win as before, except that an ineligible entry never
        # replaces an eligible one already recorded for the same form.
        if current is None or enrichable(entry) or not enrichable(current):
            by_word[form] = entry

    selected: List[Dict[str, object]] = []
    for word in requested:
        entry = by_word.get(word)
        if entry is None or not enrichable(entry):
            continue
        selected.append(entry)
        if len(selected) >= limit:
            break
    return selected
|
||||
|
||||
|
||||
def load_source_payload(enriched_path: Path, semantic_path: Path) -> Dict[str, object]:
    """Load the lexicon to select candidates from.

    The enriched lexicon is preferred when the file exists and contains an
    ``entries`` key; otherwise the semantic lexicon is used.

    Raises:
        ValueError: If neither file yields a dict with an ``entries`` key.
    """

    def usable(candidate: object) -> bool:
        return isinstance(candidate, dict) and "entries" in candidate

    if enriched_path.exists():
        enriched = load_json(enriched_path, {})
        if usable(enriched):
            return enriched

    semantic = load_json(semantic_path, {})
    if usable(semantic):
        return semantic

    raise ValueError(f"Nessun lessico valido trovato: {enriched_path} / {semantic_path}")
|
||||
|
||||
|
||||
def merge_babelnet_entries(existing_payload: Dict[str, object], new_entries: List[Dict[str, object]], topic: str, difficulty: str) -> Dict[str, object]:
    """Merge freshly enriched entries into the stored BabelNet archive.

    Entries are indexed by ``entry_key``; a new entry replaces a stored one
    with the same key and is stamped with ``babelnet_generated_at``.

    Args:
        existing_payload: Archive loaded from disk (``meta`` and ``entries``).
        new_entries: Entries enriched during this run.
        topic: Topic recorded in the metadata as ``last_topic``.
        difficulty: Difficulty recorded as ``last_difficulty``.

    Returns:
        A new payload with updated ``meta`` and entries sorted by form and
        part of speech.
    """
    merged: Dict[Tuple[str, str], Dict[str, object]] = {}
    for entry in existing_payload.get("entries", []) or []:
        if isinstance(entry, dict):
            merged[entry_key(entry)] = deepcopy(entry)

    stamp = datetime.now().astimezone().isoformat(timespec="seconds")

    for entry in new_entries:
        fresh = deepcopy(entry)
        fresh["babelnet_generated_at"] = stamp
        merged[entry_key(fresh)] = fresh

    entries = list(merged.values())
    entries.sort(key=lambda item: (str(item.get("form", "")), str(item.get("pos", ""))))

    previous_meta = existing_payload.get("meta", {})
    meta = dict(previous_meta) if isinstance(previous_meta, dict) else {}
    meta["language"] = meta.get("language", "it")
    meta["version"] = max(1, int(meta.get("version", 1)))
    meta["source"] = "BabelNet API"
    meta["updated_at"] = stamp
    meta["last_topic"] = topic
    meta["last_difficulty"] = difficulty
    meta["entry_count"] = len(entries)

    return {"meta": meta, "entries": entries}
|
||||
|
||||
|
||||
def rebuild_enriched(semantic_path: Path, babelnet_path: Path, enriched_path: Path, topic: str) -> Dict[str, object]:
    """Regenerate the enriched lexicon and write it to ``enriched_path``.

    The arguments are packed into a namespace with the attributes
    ``semantic``, ``babelnet``, ``output`` and ``topic`` and handed to
    ``build_enriched_lexicon``.

    Returns:
        The payload that was written.
    """
    build_args = SimpleNamespace(
        semantic=semantic_path,
        babelnet=babelnet_path,
        output=enriched_path,
        topic=topic,
    )
    enriched = build_enriched_lexicon(build_args)
    write_enriched_json(enriched_path, enriched)
    return enriched
|
||||
|
||||
|
||||
def run_incremental_enrichment(args: argparse.Namespace) -> Dict[str, object]:
    """Run one incremental enrichment pass.

    Candidates come from ``args.words`` when given, otherwise from the topic
    and difficulty selection. In dry-run mode only the selection is returned.
    Otherwise each candidate is enriched through BabelNet, the request cache
    is saved after every word, the archive is merged and written, and the
    enriched lexicon is rebuilt.

    Args:
        args: Parsed command-line namespace.

    Returns:
        A summary dict whose ``mode`` is ``"dry-run"`` or ``"enriched"``.

    Raises:
        SystemExit: If a real run is requested without an API key.
    """
    normalized_topic = args.topic.strip().lower()
    difficulty_level = parse_difficulty(str(args.difficulty))
    source_payload = load_source_payload(args.enriched, args.semantic)

    target_words = getattr(args, "words", None)
    effective_limit = max(0, args.limit)
    if target_words:
        candidates = select_word_candidates(
            source_payload, target_words, effective_limit, args.retry_no_match
        )
    else:
        candidates = select_candidates(
            source_payload, normalized_topic, difficulty_level, effective_limit, args.retry_no_match
        )

    if args.dry_run:
        return {
            "mode": "dry-run",
            "topic": normalized_topic,
            "difficulty": args.difficulty,
            "selected_count": len(candidates),
            "selected_words": [entry.get("form") for entry in candidates],
        }

    if not args.api_key:
        raise SystemExit(
            f"Chiave BabelNet mancante. Imposta {BABELNET_ENV_KEY} oppure usa --api-key <chiave>."
        )

    cache = load_json(BABELNET_CACHE_PATH, {})
    if not isinstance(cache, dict):
        cache = {}
    babelnet_payload = load_json(args.babelnet, {"entries": []})
    if not isinstance(babelnet_payload, dict):
        babelnet_payload = {"entries": []}

    enriched_candidates = []
    word_logs = []
    total = len(candidates)
    for position, entry in enumerate(candidates, start=1):
        updated = deepcopy(entry)
        updated.pop("babelnet", None)
        # Fresh counters per word, so the log reports per-word figures.
        stats = {"api_calls": 0, "cache_hits": 0, "responses": 0}
        updated["babelnet"] = enrich_entry(updated, args.api_key, cache, args.sleep, stats)
        enriched_candidates.append(updated)
        # Saved after every word, before the word is logged.
        write_json(BABELNET_CACHE_PATH, cache)

        outcome = updated["babelnet"]
        word_logs.append(
            {
                "word": updated["form"],
                "api_calls": stats["api_calls"],
                "cache_hits": stats["cache_hits"],
                "responses": stats["responses"],
                "matched": bool(outcome.get("matched")),
                "synsets": len(outcome.get("synsets", []) or []),
                "reason": outcome.get("reason"),
            }
        )
        print(
            f"[{position}/{total}] {updated['form']}: "
            f"api_calls={stats['api_calls']} cache_hits={stats['cache_hits']} "
            f"risposta={stats['responses'] > 0} match={outcome.get('matched')}"
        )

    merged_babelnet = merge_babelnet_entries(
        babelnet_payload, enriched_candidates, normalized_topic, str(args.difficulty)
    )
    write_json(args.babelnet, merged_babelnet)
    enriched_payload = rebuild_enriched(args.semantic, args.babelnet, args.enriched, normalized_topic)

    return {
        "mode": "enriched",
        "topic": normalized_topic,
        "difficulty": args.difficulty,
        "selected_count": len(candidates),
        "matched_count": sum(1 for entry in enriched_candidates if entry.get("babelnet", {}).get("matched")),
        "api_call_count": sum(item["api_calls"] for item in word_logs),
        "cache_hit_count": sum(item["cache_hits"] for item in word_logs),
        "word_logs": word_logs,
        "babelnet_entry_count": merged_babelnet["meta"]["entry_count"],
        "enriched_status_counts": enriched_payload["meta"]["babelnet_status_counts"],
    }
|
||||
|
||||
|
||||
def _report_dry_run(result: Dict[str, object]) -> None:
    """Print the candidate list produced by a dry run."""
    print("Dry-run BabelNet incrementale")
    print(f"Topic: {result['topic']}")
    print(f"Difficolta: {result['difficulty']}")
    print(f"Parole selezionate: {result['selected_count']}")
    for position, word in enumerate(result["selected_words"], start=1):
        print(f"{position:2d}. {word}")


def _report_enrichment(result: Dict[str, object]) -> None:
    """Print the totals and the per-word log of a real enrichment run."""
    print("Arricchimento BabelNet completato")
    print(f"Topic: {result['topic']}")
    print(f"Parole interrogate: {result['selected_count']}")
    print(f"Chiamate API BabelNet reali: {result['api_call_count']}")
    print(f"Risposte da cache: {result['cache_hit_count']}")
    print(f"Match BabelNet: {result['matched_count']}")
    for item in result["word_logs"]:
        print(
            f"- {item['word']}: api_calls={item['api_calls']}, "
            f"cache_hits={item['cache_hits']}, risposta={item['responses'] > 0}, "
            f"match={item['matched']}, synsets={item['synsets']}"
        )
    print(f"Voci BabelNet archiviate: {result['babelnet_entry_count']}")
    print(f"Stati lessico arricchito: {result['enriched_status_counts']}")


def main() -> None:
    """Parse the command line, run the enrichment and print a report."""
    result = run_incremental_enrichment(parse_args())
    if result["mode"] == "dry-run":
        _report_dry_run(result)
    else:
        _report_enrichment(result)


if __name__ == "__main__":
    main()
|
||||
385
build_babelnet_enrichment.py
Normal file
385
build_babelnet_enrichment.py
Normal file
@@ -0,0 +1,385 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional
|
||||
|
||||
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Files stored next to this module: the default enrichment output, the
# request cache, and the optional local file read for API keys.
BABELNET_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_babelnet.json")
BABELNET_CACHE_PATH = Path(__file__).with_name(".babelnet_cache.json")
BABELNET_LOCAL_KEY_PATH = Path(__file__).with_name(".babelnet_api_key.local")
# Base URL that request_json() prefixes to every endpoint name.
BABELNET_API_BASE = "https://babelnet.io/v9"
# Name of the environment variable consulted first for API keys.
BABELNET_ENV_KEY = "BABELNET_API_KEY"

# Lexicon part-of-speech tag -> value sent as the `pos` request parameter.
# Entries whose tag is not listed here are not sent to BabelNet.
POS_TO_BABELNET = {
    "NOUN": "NOUN",
    "VERB": "VERB",
    "ADJ": "ADJECTIVE",
    "ADV": "ADVERB",
}


class BabelNetApiCallLimitReached(RuntimeError):
    """Raised by request_json() when ``stats["api_call_limit"]`` is reached."""

    pass


class BabelNetKeyUnavailable(RuntimeError):
    """Raised by request_json() when BabelNet answers with HTTP 403."""

    pass
|
||||
|
||||
# Named difficulty levels accepted by --difficulty and their numeric value.
DIFFICULTY_ALIASES: Dict[str, int] = {
    "easy": 1,
    "medium": 2,
    "hard": 4,
    "expert": 5,
}


def parse_difficulty(value: str) -> int:
    """Convert a ``--difficulty`` argument into a numeric level.

    Args:
        value: An alias (``easy``, ``medium``, ``hard``, ``expert``) or an
            integer literal; surrounding whitespace and case are ignored.

    Returns:
        The numeric difficulty level, between 1 and 5.

    Raises:
        SystemExit: If the value is neither a known alias nor an integer in
            the 1-5 range.
    """
    normalized = str(value).strip().lower()
    try:
        return DIFFICULTY_ALIASES[normalized]
    except KeyError:
        pass  # Not an alias: fall through to the numeric form.

    try:
        numeric = int(normalized)
    except ValueError as exc:
        raise SystemExit(
            "Valore non valido per --difficulty. Usa easy, medium, hard, expert oppure un intero tra 1 e 5."
        ) from exc

    if 1 <= numeric <= 5:
        return numeric
    raise SystemExit("Il valore numerico di --difficulty deve essere compreso tra 1 e 5.")
|
||||
|
||||
|
||||
def _split_api_keys(text: str) -> List[str]:
|
||||
keys = []
|
||||
seen = set()
|
||||
normalized = text.replace(";", "\n").replace(",", "\n")
|
||||
for line in normalized.splitlines():
|
||||
key = line.strip()
|
||||
if not key or key.startswith("#") or key in seen:
|
||||
continue
|
||||
keys.append(key)
|
||||
seen.add(key)
|
||||
return keys
|
||||
|
||||
|
||||
def load_babelnet_api_keys() -> List[str]:
    """Return the configured BabelNet API keys.

    Keys are read from the ``BABELNET_ENV_KEY`` environment variable first.
    When the variable is unset, or set to something that contains no usable
    key (only whitespace, separators or ``#`` comments), the local key file
    is consulted instead; previously such a value suppressed the fallback.

    Returns:
        The list of keys in configuration order; empty when none is found.
    """
    env_value = os.environ.get(BABELNET_ENV_KEY)
    if env_value:
        env_keys = _split_api_keys(env_value)
        if env_keys:
            return env_keys
    if BABELNET_LOCAL_KEY_PATH.exists():
        return _split_api_keys(BABELNET_LOCAL_KEY_PATH.read_text(encoding="utf-8"))
    return []
|
||||
|
||||
|
||||
def load_babelnet_api_key() -> Optional[str]:
    """Return the first configured BabelNet API key, or ``None`` if absent."""
    return next(iter(load_babelnet_api_keys()), None)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Define and parse the command line of the BabelNet enrichment script.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    cli = argparse.ArgumentParser(
        description="Arricchisce lexicon_it_semantic.json usando BabelNet, se disponibile una API key."
    )
    add = cli.add_argument

    add(
        "--api-key",
        default=load_babelnet_api_key(),
        help=(
            f"Chiave API BabelNet. In alternativa imposta {BABELNET_ENV_KEY} "
            f"o crea {BABELNET_LOCAL_KEY_PATH.name}."
        ),
    )
    add("--topic", default=None, help="Topic opzionale da usare per limitare le voci da arricchire.")
    add(
        "--difficulty",
        default="medium",
        help="Difficolta massima delle voci da arricchire: easy, medium, hard, expert oppure 1-5.",
    )
    add("--limit", type=int, default=100, help="Numero massimo di lemmi da interrogare in questa esecuzione.")
    add(
        "--sleep",
        type=float,
        default=0.2,
        help="Pausa tra richieste API, utile per non stressare il servizio.",
    )
    add("--output", type=Path, default=BABELNET_OUTPUT_PATH, help="File JSON di output.")
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Read a UTF-8 JSON file, returning ``default`` when it does not exist."""
    if path.exists():
        with path.open(encoding="utf-8") as handle:
            return json.load(handle)
    return default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Serialize ``payload`` as indented, non-ASCII-escaped JSON into ``path``."""
    # Serialize first so a failure here leaves the existing file untouched.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def cache_key(endpoint: str, params: Dict[str, str]) -> str:
    """Build the cache key of a request.

    The ``key`` parameter (the API key) is left out, so the cache neither
    stores the secret nor depends on which key made the request. The other
    parameters are sorted by name and URL-encoded.
    """
    public_items = sorted((name, value) for name, value in params.items() if name != "key")
    query = urllib.parse.urlencode(public_items)
    return f"{endpoint}?{query}"
|
||||
|
||||
|
||||
def request_json(
    endpoint: str,
    params: Dict[str, str],
    cache: Dict[str, object],
    stats: Optional[Dict[str, int]] = None,
) -> object:
    """Fetch a BabelNet endpoint as JSON, going through the cache.

    Args:
        endpoint: Endpoint name appended to ``BABELNET_API_BASE``.
        params: Query parameters, API key included.
        cache: Mapping from ``cache_key`` to decoded responses; updated in
            place after a successful request.
        stats: Optional counters. ``cache_hits``, ``api_calls`` and
            ``responses`` are incremented; ``api_call_limit``, when present,
            caps the number of real requests.

    Returns:
        The decoded JSON payload, from the cache or from the service.

    Raises:
        BabelNetApiCallLimitReached: If the call limit is already reached.
        BabelNetKeyUnavailable: If the service answers with HTTP 403.
        RuntimeError: For any other HTTP error status.
    """
    url = f"{BABELNET_API_BASE}/{endpoint}?{urllib.parse.urlencode(params)}"
    key = cache_key(endpoint, params)
    tracking = stats is not None

    if key in cache:
        if tracking:
            stats["cache_hits"] = stats.get("cache_hits", 0) + 1
        return cache[key]

    if tracking:
        call_limit = stats.get("api_call_limit")
        if call_limit is not None and stats.get("api_calls", 0) >= call_limit:
            raise BabelNetApiCallLimitReached("Limite chiamate API BabelNet raggiunto")

    request = urllib.request.Request(url, headers={"Accept": "application/json"})
    try:
        with urllib.request.urlopen(request, timeout=30) as response:
            payload = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        if exc.code != 403:
            raise RuntimeError(f"Errore BabelNet HTTP {exc.code}: {detail}") from exc
        raise BabelNetKeyUnavailable(f"Chiave BabelNet non valida o limite giornaliero raggiunto: {detail}") from exc

    # Only successful requests are cached and counted.
    cache[key] = payload
    if tracking:
        stats["api_calls"] = stats.get("api_calls", 0) + 1
        stats["responses"] = stats.get("responses", 0) + 1
    return payload
|
||||
|
||||
|
||||
def entry_topics(entry: Dict[str, object]) -> set[str]:
    """Return the lower-cased topic labels of an entry.

    A missing or ``None`` ``topics`` value yields an empty set, and falsy
    items (``None``, empty strings) are skipped instead of being turned into
    labels such as ``"none"``.
    """
    return {str(item).lower() for item in entry.get("topics") or () if item}
|
||||
|
||||
|
||||
def select_entries(payload: Dict[str, object], topic: Optional[str], difficulty_level: int, limit: int) -> List[Dict[str, object]]:
    """Pick the first ``limit`` entries that can be sent to BabelNet.

    Args:
        payload: Lexicon payload holding an ``entries`` list; a missing or
            ``None`` list is treated as empty and non-dict items are skipped.
        topic: Optional topic; when given, only entries listing it are kept.
        difficulty_level: Highest accepted ``difficulty_word``.
        limit: Maximum number of entries. A value of zero or less returns an
            empty list (previously one entry was still returned because the
            limit was checked only after appending).

    Returns:
        The selected entries, in lexicon order.
    """
    if limit <= 0:
        return []

    selected: List[Dict[str, object]] = []
    normalized_topic = topic.strip().lower() if topic else None

    for entry in payload.get("entries", []) or []:
        if not isinstance(entry, dict):
            continue
        word = str(entry.get("form", ""))
        if not word or not word.isalpha():
            continue
        if len(word) < 3 or len(word) > 16:
            continue
        if int(entry.get("difficulty_word", 5)) > difficulty_level:
            continue
        if str(entry.get("pos", "")) not in POS_TO_BABELNET:
            continue
        if normalized_topic and normalized_topic not in entry_topics(entry):
            continue
        selected.append(entry)
        if len(selected) >= limit:
            break

    return selected
|
||||
|
||||
|
||||
def compact_synset_id(payload: Dict[str, object]) -> Dict[str, object]:
    """Keep only the ``id``, ``pos`` and ``source`` fields of a synset reference."""
    return {field: payload.get(field) for field in ("id", "pos", "source")}
|
||||
|
||||
|
||||
def extract_glosses(payload: Dict[str, object]) -> List[str]:
    """Return up to five distinct glosses of a synset payload.

    A gloss is kept when its text is non-empty and its language is ``IT``,
    ``ITA`` or missing.
    """
    accepted_languages = {"IT", "ITA", ""}
    kept = []
    for item in payload.get("glosses", []) or []:
        language = str(item.get("language", "")).upper()
        text = str(item.get("gloss", "")).strip()
        if not text or language not in accepted_languages:
            continue
        kept.append(text)
    return dedupe(kept)[:5]
|
||||
|
||||
|
||||
def extract_senses(payload: Dict[str, object]) -> List[str]:
    """Return up to twenty distinct lemmas of a synset payload.

    The lemma is read from ``properties.simpleLemma`` and falls back to
    ``fullLemma``; underscores become spaces. A sense is kept when its
    language is ``IT``, ``ITA`` or missing.
    """
    accepted_languages = {"IT", "ITA", ""}
    lemmas = []
    for item in payload.get("senses", []) or []:
        language = str(item.get("language", "")).upper()
        raw_lemma = item.get("properties", {}).get("simpleLemma") or item.get("fullLemma") or ""
        lemma = str(raw_lemma).strip()
        if not lemma or language not in accepted_languages:
            continue
        lemmas.append(lemma.replace("_", " "))
    return dedupe(lemmas)[:20]
|
||||
|
||||
|
||||
def extract_categories(payload: Dict[str, object]) -> List[str]:
    """Return up to twenty distinct, non-empty category names of a synset."""
    names = (
        str(item.get("category", "")).strip()
        for item in payload.get("categories", []) or []
    )
    return dedupe(name for name in names if name)[:20]
|
||||
|
||||
|
||||
def extract_domains(payload: Dict[str, object]) -> List[str]:
    """Return the domain labels of a synset payload.

    ``domains`` may be a mapping, in which case the keys with a truthy value
    are returned sorted, or a list, in which case up to twenty distinct
    truthy items are returned. Any other type yields an empty list.
    """
    domains = payload.get("domains", [])
    if isinstance(domains, dict):
        enabled = [str(name) for name, flag in domains.items() if flag]
        enabled.sort()
        return enabled
    if not isinstance(domains, list):
        return []
    return dedupe(str(item) for item in domains if item)[:20]
|
||||
|
||||
|
||||
def dedupe(items: Iterable[str]) -> List[str]:
    """Strip each item and return the distinct non-empty texts in first-seen order."""
    stripped = (str(item).strip() for item in items)
    # dict.fromkeys keeps insertion order and drops repeated texts.
    return list(dict.fromkeys(text for text in stripped if text))
|
||||
|
||||
|
||||
def enrich_entry(
    entry: Dict[str, object],
    api_key: str,
    cache: Dict[str, object],
    sleep_seconds: float,
    stats: Optional[Dict[str, int]] = None,
) -> Dict[str, object]:
    """Look an entry up on BabelNet and summarize its first synsets.

    The synset ids of the lemma are requested first, then the details of at
    most three of them. The pause between requests is applied only after a
    lookup that was not already cached: a cache hit never reaches the
    service, so it no longer slows down re-runs.

    Args:
        entry: Lexicon entry with ``form`` and ``pos``.
        api_key: BabelNet API key.
        cache: Request cache shared with ``request_json``; updated in place.
        sleep_seconds: Pause after each real API request; 0 disables it.
        stats: Optional counters forwarded to ``request_json``.

    Returns:
        A dict with ``matched`` and ``synsets``; unmatched results carry a
        ``reason``, matched ones a ``synset_refs`` list.
    """
    word = str(entry.get("form", ""))
    pos = POS_TO_BABELNET.get(str(entry.get("pos", "")))
    if not pos:
        return {"matched": False, "reason": "unsupported_pos", "synsets": []}

    def fetch(endpoint: str, params: Dict[str, str]) -> object:
        # Check the cache before the call: afterwards the key is always there.
        already_cached = cache_key(endpoint, params) in cache
        payload = request_json(endpoint, params, cache, stats)
        if sleep_seconds and not already_cached:
            time.sleep(sleep_seconds)
        return payload

    synset_ids = fetch(
        "getSynsetIds",
        {
            "lemma": word,
            "searchLang": "IT",
            "pos": pos,
            "key": api_key,
        },
    )
    if not isinstance(synset_ids, list) or not synset_ids:
        return {"matched": False, "reason": "no_synsets", "synsets": []}

    synsets = []
    for synset_ref in synset_ids[:3]:
        synset_id = synset_ref.get("id") if isinstance(synset_ref, dict) else str(synset_ref)
        if not synset_id:
            continue
        synset_payload = fetch(
            "getSynset",
            {
                "id": synset_id,
                "targetLang": "IT",
                "key": api_key,
            },
        )
        if not isinstance(synset_payload, dict):
            continue
        synsets.append(
            {
                "id": synset_id,
                "senses": extract_senses(synset_payload),
                "glosses": extract_glosses(synset_payload),
                "categories": extract_categories(synset_payload),
                "domains": extract_domains(synset_payload),
            }
        )

    return {
        "matched": bool(synsets),
        "synset_refs": [compact_synset_id(item) for item in synset_ids[:5] if isinstance(item, dict)],
        "synsets": synsets,
    }
|
||||
|
||||
|
||||
def build_babelnet_enrichment(args: argparse.Namespace) -> Dict[str, object]:
    """Enrich a selection of the semantic lexicon through BabelNet.

    Args:
        args: Parsed command line; ``api_key``, ``topic``, ``difficulty``,
            ``limit`` and ``sleep`` are read here.

    Returns:
        A payload with a ``meta`` block describing the run and the enriched
        ``entries``.

    Raises:
        SystemExit: If no API key is available.
        FileNotFoundError: If the semantic lexicon file does not exist.
    """
    if not args.api_key:
        raise SystemExit(
            f"Chiave BabelNet mancante. Imposta {BABELNET_ENV_KEY} oppure usa --api-key <chiave>."
        )
    if not SEMANTIC_LEXICON_OUTPUT_PATH.exists():
        raise FileNotFoundError(f"Lessico semantico non trovato: {SEMANTIC_LEXICON_OUTPUT_PATH}")

    payload = load_json(SEMANTIC_LEXICON_OUTPUT_PATH, {})
    cache = load_json(BABELNET_CACHE_PATH, {})
    # A cache file that does not hold a JSON object is discarded.
    if not isinstance(cache, dict):
        cache = {}

    difficulty_level = parse_difficulty(str(args.difficulty))
    selected_entries = select_entries(payload, args.topic, difficulty_level, args.limit)
    enriched_entries = []

    for index, entry in enumerate(selected_entries, start=1):
        # Shallow copy: the source entry does not receive the "babelnet" key.
        enriched = dict(entry)
        enriched["babelnet"] = enrich_entry(enriched, args.api_key, cache, args.sleep)
        enriched_entries.append(enriched)
        print(f"[{index}/{len(selected_entries)}] {entry['form']}: {enriched['babelnet'].get('matched')}")
        # The request cache is written back to disk after each word.
        write_json(BABELNET_CACHE_PATH, cache)

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": SEMANTIC_LEXICON_OUTPUT_PATH.name,
            "source": "BabelNet API",
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "topic": args.topic,
            "difficulty": args.difficulty,
            "requested_limit": args.limit,
            "entry_count": len(enriched_entries),
        },
        "entries": enriched_entries,
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Run the enrichment, write the output file and print a short report."""
    args = parse_args()
    payload = build_babelnet_enrichment(args)
    write_json(args.output, payload)

    matched_entries = [
        entry for entry in payload["entries"] if entry.get("babelnet", {}).get("matched")
    ]
    print(f"Lessico BabelNet generato: {args.output}")
    print(f"Voci arricchite: {payload['meta']['entry_count']}")
    print(f"Voci con match BabelNet: {len(matched_entries)}")


if __name__ == "__main__":
    main()
|
||||
324
build_enriched_lexicon.py
Normal file
324
build_enriched_lexicon.py
Normal file
@@ -0,0 +1,324 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Tuple
|
||||
|
||||
from build_babelnet_enrichment import BABELNET_OUTPUT_PATH
|
||||
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default output file of this script, stored next to the module.
ENRICHED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_enriched.json")

# Per-topic domain labels grouped into "strong", "weak" and "negative" buckets.
# NOTE(review): the labels look like BabelNet domain names and the buckets are
# presumably weighted when a synset is chosen for a topic - confirm against the
# code that reads this mapping.
TOPIC_DOMAIN_RULES: Dict[str, Dict[str, Tuple[str, ...]]] = {
    "transport": {
        "strong": (
            "TRANSPORT_AND_TRAVEL",
            "NAVIGATION_AND_AVIATION",
        ),
        "weak": (
            "CRAFT_ENGINEERING_AND_TECHNOLOGY",
            "FARMING_FISHING_AND_HUNTING",
        ),
        "negative": (
            "MEDIA_AND_PRESS",
            "PHILOSOPHY_PSYCHOLOGY_AND_BEHAVIOR",
            "RELIGION_MYSTICISM_AND_MYTHOLOGY",
            "CHEMISTRY_AND_MINERALOGY",
        ),
    },
    "health": {
        "strong": ("HEALTH_AND_MEDICINE",),
        "weak": ("BIOLOGY",),
        "negative": ("MEDIA_AND_PRESS",),
    },
    "cinema": {
        "strong": ("MEDIA_AND_PRESS",),
        "weak": ("ART_ARCHITECTURE_AND_ARCHAEOLOGY",),
        "negative": ("HEALTH_AND_MEDICINE", "CHEMISTRY_AND_MINERALOGY"),
    },
    "nature": {
        "strong": (
            "BIOLOGY",
            "ANIMALS",
            "PLANTS",
            "EARTH",
            "METEOROLOGY",
        ),
        "weak": ("GEOGRAPHY_AND_PLACES",),
        "negative": ("MEDIA_AND_PRESS",),
    },
    "ecology": {
        "strong": ("BIOLOGY", "EARTH", "METEOROLOGY"),
        "weak": ("GEOGRAPHY_AND_PLACES",),
        "negative": ("MEDIA_AND_PRESS",),
    },
}

# Per-topic Italian keywords and word stems.
# NOTE(review): presumably matched against synset text (glosses, senses) to
# detect the topic - confirm against the code that reads this mapping.
TOPIC_TEXT_KEYWORDS: Dict[str, Tuple[str, ...]] = {
    "transport": (
        "aereo",
        "auto",
        "autobus",
        "barca",
        "bicicletta",
        "imbarcazione",
        "motore",
        "nave",
        "pista",
        "trasport",
        "treno",
        "veicolo",
        "viaggio",
    ),
    "health": ("cura", "malato", "medic", "ospedale", "paziente", "salute", "soccorso"),
    "cinema": ("attore", "cinema", "film", "pellicola", "regia", "spettacolo"),
    "nature": ("acqua", "animale", "bosco", "fiore", "mare", "montagna", "pianta", "terra"),
    "ecology": ("ambiente", "ecologia", "inquinamento", "natura", "sostenibile"),
}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line of the enrichment script.

    Returns:
        Namespace with ``semantic``, ``babelnet`` and ``output`` (all ``Path``)
        plus the optional ``topic`` string (``None`` when not given).
    """
    parser = argparse.ArgumentParser(
        description="Fonde lexicon_it_semantic.json con gli arricchimenti BabelNet gia disponibili."
    )

    # The three path options share the same shape, so they are declared as a table.
    path_options = (
        ("--semantic", SEMANTIC_LEXICON_OUTPUT_PATH, "Lessico semantico completo di partenza."),
        ("--babelnet", BABELNET_OUTPUT_PATH, "File con arricchimenti BabelNet parziali."),
        ("--output", ENRICHED_LEXICON_OUTPUT_PATH, "Lessico arricchito da generare."),
    )
    for flag, default_path, help_text in path_options:
        parser.add_argument(flag, type=Path, default=default_path, help=help_text)

    parser.add_argument(
        "--topic",
        default=None,
        help="Topic opzionale da usare per scegliere il synset BabelNet migliore.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Return the JSON document stored at *path*, or *default* if the file is missing.

    The file is read as UTF-8. A file that exists but holds invalid JSON still
    raises ``json.JSONDecodeError``.
    """
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Serialize *payload* to *path* as UTF-8 JSON (2-space indent, non-ASCII kept)."""
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Build the ``(form, pos)`` lookup key of a lexicon entry.

    The form comes from ``normalized_form`` (falling back to ``form``) and is
    stripped and lower-cased; the part of speech is stripped and upper-cased.
    Missing or empty values become empty strings.
    """
    raw_form = entry.get("normalized_form") or entry.get("form") or ""
    raw_pos = entry.get("pos") or ""
    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
|
||||
|
||||
|
||||
def dedupe(items: Iterable[str]) -> List[str]:
    """Return the stripped, non-empty string forms of *items* without duplicates.

    The first occurrence of each value wins and the original order is kept.
    """
    # A dict keeps insertion order, so it works as an ordered set.
    ordered_unique: Dict[str, None] = {}
    for raw in items:
        cleaned = str(raw).strip()
        if cleaned:
            ordered_unique.setdefault(cleaned, None)
    return list(ordered_unique)
|
||||
|
||||
|
||||
def topic_candidates(entry: Dict[str, object], requested_topic: Optional[str]) -> List[str]:
    """Return the lower-cased topics to score synsets against, in priority order.

    The explicitly requested topic (if any) comes first, followed by the
    entry's own ``topics``. Duplicates and the catch-all ``"general"`` topic
    are removed.

    Args:
        entry: lexicon entry; its ``topics`` value may be missing or ``None``.
        requested_topic: topic given on the command line, or ``None``.
    """
    # `or []` tolerates an explicit `"topics": null`, matching how the synset
    # fields are read elsewhere in this file.
    own_topics = entry.get("topics", []) or []
    topics = [str(topic).lower() for topic in own_topics if topic]
    if requested_topic:
        topics.insert(0, requested_topic.lower())
    return [topic for topic in dedupe(topics) if topic != "general"]
|
||||
|
||||
|
||||
def synset_text(synset: Dict[str, object]) -> str:
    """Join a synset's glosses, categories and senses into one lower-cased string.

    Missing or ``None`` fields contribute nothing; items are separated by a
    single space.
    """
    pieces = [
        str(value)
        for field_name in ("glosses", "categories", "senses")
        for value in (synset.get(field_name, []) or [])
    ]
    return " ".join(pieces).lower()
|
||||
|
||||
|
||||
def score_synset_for_topic(synset: Dict[str, object], topic: str) -> int:
    """Score how well *synset* fits *topic*.

    Each synset domain listed for the topic in ``TOPIC_DOMAIN_RULES`` counts
    +60 (strong), +25 (weak) or -35 (negative). Each keyword of
    ``TOPIC_TEXT_KEYWORDS`` found as a substring of the synset text adds 12.
    A topic with no rules or keywords scores 0.
    """
    domain_rules = TOPIC_DOMAIN_RULES.get(topic, {})
    synset_domains = {str(name).upper() for name in synset.get("domains", []) or []}

    def overlap(kind: str) -> int:
        # Number of synset domains listed under the given rule kind.
        return len(synset_domains.intersection(domain_rules.get(kind, ())))

    domain_score = 60 * overlap("strong") + 25 * overlap("weak") - 35 * overlap("negative")

    haystack = synset_text(synset)
    keyword_hits = sum(1 for keyword in TOPIC_TEXT_KEYWORDS.get(topic, ()) if keyword in haystack)
    return domain_score + 12 * keyword_hits
|
||||
|
||||
|
||||
def _describe_synset(
    synset: Dict[str, object], topic: Optional[str], score: int, strong: bool
) -> Dict[str, object]:
    """Build the summary dict stored for the selected synset."""
    return {
        "id": synset.get("id"),
        "topic": topic,
        "topic_score": score,
        "strong_topic": strong,
        "senses": synset.get("senses", []),
        "glosses": synset.get("glosses", []),
        "categories": synset.get("categories", []),
        "domains": synset.get("domains", []),
    }


def choose_best_synset(
    babelnet: Dict[str, object], entry: Dict[str, object], requested_topic: Optional[str]
) -> Tuple[Optional[Dict[str, object]], Dict[str, int]]:
    """Pick the synset that best matches the candidate topics of *entry*.

    Args:
        babelnet: raw BabelNet payload; only dict items of ``synsets`` are used.
        entry: lexicon entry the payload belongs to.
        requested_topic: optional topic that takes precedence over the entry's own.

    Returns:
        ``(best, topic_scores)``. ``best`` is ``None`` when there are no
        synsets. With synsets but no candidate topic, the first synset is
        returned with score 0. Otherwise ``best`` is the highest-scoring
        (topic, synset) pair, where the first one wins ties, and it is flagged
        ``strong_topic`` when the score is at least 40. ``topic_scores`` maps
        each topic to its best score.
    """
    synsets = [item for item in babelnet.get("synsets", []) or [] if isinstance(item, dict)]
    topics = topic_candidates(entry, requested_topic)
    if not synsets:
        return None, {}

    if not topics:
        return _describe_synset(synsets[0], None, 0, False), {}

    topic_scores: Dict[str, int] = {}
    best_synset: Optional[Dict[str, object]] = None
    best_topic: Optional[str] = None
    best_score = -10_000

    for topic in topics:
        # Score every synset once and reuse the values for both the per-topic
        # maximum and the overall winner.
        scores = [score_synset_for_topic(synset, topic) for synset in synsets]
        topic_scores[topic] = max(scores)
        for synset, score in zip(synsets, scores):
            if score > best_score:  # strict: the earliest best candidate wins ties
                best_score = score
                best_topic = topic
                best_synset = synset

    # Identity check: an empty-dict synset is falsy but is still a valid winner.
    if best_synset is None:
        return None, topic_scores

    return _describe_synset(best_synset, best_topic, best_score, best_score >= 40), topic_scores
|
||||
|
||||
|
||||
def normalize_babelnet_status(
    entry: Dict[str, object], babelnet_entry: Optional[Dict[str, object]], requested_topic: Optional[str]
) -> Dict[str, object]:
    """Turn the raw BabelNet record of an entry into the block stored in the lexicon.

    Possible ``status`` values:
        * ``not_requested``: no BabelNet record exists for the entry.
        * ``api_error``: the record's ``babelnet`` field is not a dict.
        * ``no_match``: BabelNet did not match the word.
        * ``ambiguous``: a synset was selected but its topic score is <= 0.
        * ``enriched``: every other matched case.
    """
    if not babelnet_entry:
        return {"status": "not_requested"}

    raw_babelnet = babelnet_entry.get("babelnet", {})
    if not isinstance(raw_babelnet, dict):
        return {"status": "api_error", "reason": "invalid_babelnet_payload"}

    if not raw_babelnet.get("matched"):
        return {
            "status": "no_match",
            "matched": False,
            "reason": raw_babelnet.get("reason", "no_synsets"),
            "synsets": [],
        }

    best_synset, topic_scores = choose_best_synset(raw_babelnet, entry, requested_topic)

    if best_synset:
        selected_synset_id = best_synset.get("id")
        selected_topic = best_synset.get("topic")
        topic_score = int(best_synset.get("topic_score", 0))
        strong_topic = bool(best_synset.get("strong_topic", False))
        status = "ambiguous" if topic_score <= 0 else "enriched"
    else:
        # NOTE(review): a matched record with no usable synset is still reported
        # as "enriched" - confirm this is intended.
        selected_synset_id = None
        selected_topic = None
        topic_score = 0
        strong_topic = False
        status = "enriched"

    return {
        "status": status,
        "matched": True,
        "selected_synset_id": selected_synset_id,
        "selected_topic": selected_topic,
        "topic_score": topic_score,
        "strong_topic": strong_topic,
        "synset_refs": raw_babelnet.get("synset_refs", []),
        "synsets": raw_babelnet.get("synsets", []),
        "topic_scores": topic_scores,
        "best_synset": best_synset,
        "source_generated_at": babelnet_entry.get("babelnet_generated_at"),
    }
|
||||
|
||||
|
||||
def build_babelnet_index(payload: Dict[str, object]) -> Dict[Tuple[str, str], Dict[str, object]]:
    """Index the BabelNet entries of *payload* by their ``entry_key``.

    Non-dict items are ignored. When two entries share a key, the later one wins.
    """
    candidates = payload.get("entries", []) or []
    return {entry_key(item): item for item in candidates if isinstance(item, dict)}
|
||||
|
||||
|
||||
def build_enriched_lexicon(args: argparse.Namespace) -> Dict[str, object]:
    """Merge the semantic lexicon with the available BabelNet enrichments.

    Args:
        args: parsed options; reads ``semantic``, ``babelnet`` and ``topic``.

    Returns:
        A dict with a ``meta`` section (including per-status counts) and the
        list of enriched ``entries``. Each entry is a deep copy of the semantic
        entry with a ``babelnet`` block attached.

    Raises:
        ValueError: if the semantic lexicon is missing, not a JSON object, or
            has no ``entries`` key.
    """
    semantic_payload = load_json(args.semantic, {})
    semantic_is_valid = isinstance(semantic_payload, dict) and "entries" in semantic_payload
    if not semantic_is_valid:
        raise ValueError(f"Lessico semantico non valido: {args.semantic}")

    # A missing or malformed BabelNet file just means nothing to merge.
    loaded_babelnet = load_json(args.babelnet, {"entries": []})
    babelnet_payload = loaded_babelnet if isinstance(loaded_babelnet, dict) else {"entries": []}
    babelnet_index = build_babelnet_index(babelnet_payload)

    enriched_entries = []
    status_counts: Dict[str, int] = {}
    for source_entry in semantic_payload.get("entries", []) or []:
        if not isinstance(source_entry, dict):
            continue
        enriched = deepcopy(source_entry)
        matching_record = babelnet_index.get(entry_key(enriched))
        babelnet_block = normalize_babelnet_status(enriched, matching_record, args.topic)
        enriched["babelnet"] = babelnet_block

        status = str(babelnet_block.get("status", "unknown"))
        status_counts[status] = status_counts.get(status, 0) + 1
        enriched_entries.append(enriched)

    meta = {
        "language": semantic_payload.get("meta", {}).get("language", "it"),
        "version": 1,
        "base_lexicon": args.semantic.name,
        "babelnet_source": args.babelnet.name if args.babelnet.exists() else None,
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "requested_topic": args.topic,
        "entry_count": len(enriched_entries),
        "babelnet_status_counts": status_counts,
    }
    return {"meta": meta, "entries": enriched_entries}
|
||||
|
||||
|
||||
def main() -> None:
    """Build the enriched lexicon, write it to disk and print a short summary."""
    args = parse_args()
    payload = build_enriched_lexicon(args)
    write_json(args.output, payload)

    print(f"Lessico arricchito generato: {args.output}")
    meta = payload["meta"]
    print(f"Voci totali: {meta['entry_count']}")
    print(f"Stati BabelNet: {meta['babelnet_status_counts']}")


if __name__ == "__main__":
    main()
|
||||
249
build_lexicon.py
Normal file
249
build_lexicon.py
Normal file
@@ -0,0 +1,249 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
from build_vocabulary import (
|
||||
FILTERED_OUTPUT_PATH,
|
||||
METADATA_OUTPUT_PATH,
|
||||
build_vocabulary,
|
||||
)
|
||||
|
||||
|
||||
# Output file of the base lexicon, placed next to this script.
LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it.json")

# Vocabulary tag -> part of speech. infer_pos() returns the POS of the first
# tag found here and falls back to "NOUN" when none matches.
POS_BY_TAG = {
    "function": "PREP",
    "verb_infinitive": "VERB",
    "adverb": "ADV",
    "adjective_like": "ADJ",
    "noun_like": "NOUN",
}

# (minimum quality, register label) pairs in descending threshold order;
# infer_register() returns the label of the first threshold the quality reaches.
REGISTER_BY_QUALITY = [
    (8, "common"),
    (5, "standard"),
    (3, "formal"),
    (0, "rare"),
]
|
||||
|
||||
# Topic -> set of exact word forms belonging to it. infer_topics() tests whole
# words for membership (no stemming), so a word may fall under several topics.
TOPIC_KEYWORDS = {
    "animals": {
        "cane", "gatto", "lupo", "volpe", "orso", "pesce", "cervo", "cavallo", "capra", "pecora",
        "leone", "tigre", "zebra", "aquila", "falco", "serpente", "vipera", "gabbiano", "anatra",
        "passero", "coniglio", "castoro", "bruco", "cigno", "asino", "alpaca",
    },
    "plants": {
        "albero", "pianta", "fiore", "foglia", "radice", "seme", "bosco", "selva", "ulivo", "quercia",
        "ortica", "edera", "aloe", "tulipano", "spiga", "polline", "grano", "erba",
    },
    "nature": {
        "natura", "bosco", "selva", "montagna", "collina", "roccia", "pietra", "fiume", "lago", "mare",
        "riva", "fonte", "onda", "vento", "fuoco", "terra", "sole", "luna", "aurora", "nuvola",
        "nebbia", "deserto", "isola", "greto", "radice", "fiore", "foglia", "erba", "zolla",
    },
    "ecology": {
        "ambiente", "ecologia", "natura", "bosco", "energia", "acqua", "terra", "clima", "sorgere",
        "fonte", "solare", "verde", "ulivo", "pianta", "polline", "grano", "radice",
    },
    "geography": {
        "montagna", "collina", "isola", "deserto", "equatore", "ovest", "oriente", "riva", "mare",
        "lago", "fiume", "ponte", "confine", "quota", "pianeta", "roccia", "greto",
    },
    "weather": {
        "vento", "nebbia", "aurora", "pioggia", "sole", "nuvola", "tempesta", "brina", "sereno",
        "clima", "goccia",
    },
    "sea": {
        "mare", "onda", "vela", "barca", "porto", "pesce", "ancora", "scoglio", "riva", "veliero",
    },
    "mountain": {
        "montagna", "quota", "vetta", "roccia", "greto", "collina", "sentiero", "alpino",
    },
    "health": {
        "salute", "febbre", "medico", "cura", "respiro", "diuretico", "anemico", "vigore", "energia",
        "dente", "cuore", "corpo", "viso",
    },
    "science": {
        "atomo", "energia", "metodo", "equatore", "digitale", "misura", "tecnica", "triangolo",
        "microfibra", "microscopio", "algoritmo", "motore", "materia", "liquido",
    },
    "sport": {
        "calcio", "atleta", "sportivo", "gol", "pallone", "gara", "trionfo", "primato", "allenatore",
        "stadio", "squadra", "rete",
    },
    "history": {
        "re", "principe", "regno", "impero", "senato", "console", "legione", "vittoria", "epoca",
        "origine", "ritorno",
    },
    "school": {
        "libro", "quaderno", "lezione", "classe", "studiare", "maestro", "scuola", "esame", "penna",
        "aula", "figura", "titolo",
    },
    "cinema": {
        "film", "teatro", "attore", "scena", "dialogo", "regista", "pellicola", "cinema",
        "doppiatore", "documentario", "cinegiornale", "colossal", "commedia", "comparsa",
        "controfigura", "diva", "divo", "cabaret", "cartoon",
    },
    "literature": {
        "libro", "poesia", "favola", "fiaba", "frase", "parola", "lettura", "autore", "storia",
        "leggenda", "scrivere", "titolo",
    },
    "food": {
        "pane", "cacao", "gelato", "burro", "latte", "mandorla", "nocciola", "cena", "pranzo",
        "zuppa", "zucchero", "acqua", "fiore", "frutto",
    },
    "city": {
        "porta", "strada", "piazza", "ponte", "palazzo", "cortile", "villaggio", "citta", "urbano",
        "casale", "balcone", "finestra", "stazione",
    },
    "transport": {
        "automobile", "auto", "automezzo", "autoveicolo", "autovettura", "autobus", "autocarro",
        "aeromobile", "aeroplano", "aeroporto", "ambulanza", "autoambulanza", "astronave",
        "barca", "barchetta", "bastimento", "bicicletta", "bici", "bimotore", "bireattore",
        "bombardiere", "imbarcazione", "motrice", "motore", "nave", "pista", "porto",
        "quadrimotore", "reattore", "rimorchio", "rimorchiatore", "rotaia", "ruota", "trattore",
        "treno", "vapore", "vela", "veliero", "vettura", "volante", "volo",
    },
    "work": {
        "lavoro", "opera", "progetto", "metodo", "tecnica", "strumento", "martello", "guida",
        "mestiere", "servire",
    },
    "home": {
        "casa", "finestra", "porta", "parete", "divano", "tavolo", "sedia", "camera", "balcone",
        "camino", "tetto", "cortile", "vasca",
    },
}
|
||||
|
||||
# Topic -> word endings that imply it; infer_topics() adds the topic to any
# word ending with one of the listed suffixes, whatever its tags.
# NOTE(review): "are"/"ere"/"ire" also match non-verbs such as "mare" - confirm
# that tagging those as "actions" is intended.
TOPIC_SUFFIXES = {
    "actions": ("are", "ere", "ire"),
    "abstract": ("zione", "zioni", "ismo", "ezza", "ita", "mento", "anza", "enza"),
}
|
||||
|
||||
|
||||
def infer_pos(tags: List[str]) -> str:
    """Return the part of speech of the first tag known to ``POS_BY_TAG``.

    Falls back to ``"NOUN"`` when no tag is recognised.
    """
    known = (POS_BY_TAG[tag] for tag in tags if tag in POS_BY_TAG)
    return next(known, "NOUN")
|
||||
|
||||
|
||||
def infer_topics(word: str, tags: List[str]) -> List[str]:
    """Infer the sorted list of topics for *word* using keyword and suffix heuristics.

    Every word gets ``"general"``. Further topics come from:
      * the ``verb_infinitive`` tag (adds ``"actions"``);
      * exact membership in a ``TOPIC_KEYWORDS`` set;
      * a matching ending in ``TOPIC_SUFFIXES``;
      * implications: plants imply nature and ecology; animals, sea, mountain,
        weather and geography imply nature.

    Args:
        word: normalized word form.
        tags: vocabulary tags attached to the word.
    """
    topics = {"general"}

    if "verb_infinitive" in tags:
        topics.add("actions")

    topics.update(topic for topic, keywords in TOPIC_KEYWORDS.items() if word in keywords)

    # TOPIC_SUFFIXES is the only source of suffix rules. The former inline copy
    # of the "abstract" endings was redundant and could drift from the table.
    # NOTE(review): the "actions" endings also match non-verbs (e.g. "mare").
    topics.update(
        topic for topic, suffixes in TOPIC_SUFFIXES.items() if word.endswith(tuple(suffixes))
    )

    if "plants" in topics:
        topics.update({"nature", "ecology"})
    if topics & {"animals", "sea", "mountain", "weather", "geography"}:
        topics.add("nature")

    return sorted(topics)
|
||||
|
||||
|
||||
def infer_register(quality: int) -> str:
    """Map a quality score to a register label through ``REGISTER_BY_QUALITY``.

    Returns the label of the first threshold that *quality* reaches, or
    ``"rare"`` when it is below all of them.
    """
    reached = (label for threshold, label in REGISTER_BY_QUALITY if quality >= threshold)
    return next(reached, "rare")
|
||||
|
||||
|
||||
def frequency_from_quality(quality: int, index: int, total: int) -> tuple[int, float]:
    """Derive a pseudo frequency rank and score from list position and quality.

    Args:
        quality: quality score; clamped to 0..10 for the boost.
        index: zero-based position of the word in the vocabulary list.
        total: number of words in the list.

    Returns:
        ``(rank, score)``. ``rank`` is ``index + 1``. ``score`` is in 0..1,
        rounded to 4 decimals: 70% of the position-based value (1.0 for the
        first word, 0.0 for the last) plus up to 0.5 from quality.
    """
    rank = index + 1
    span = max(1, total - 1)  # avoids dividing by zero for a one-word list
    normalized_rank = 1.0 - (rank - 1) / span
    quality_boost = min(max(quality, 0), 10) / 20.0
    blended = normalized_rank * 0.7 + quality_boost
    return rank, round(min(1.0, blended), 4)
|
||||
|
||||
|
||||
def load_words() -> List[str]:
    """Return the filtered vocabulary as a list of non-empty, stripped words.

    The vocabulary files are rebuilt first when the word list or its metadata
    file is missing.
    """
    if not (FILTERED_OUTPUT_PATH.exists() and METADATA_OUTPUT_PATH.exists()):
        build_vocabulary()

    content = FILTERED_OUTPUT_PATH.read_text(encoding="utf-8")
    stripped_lines = (line.strip() for line in content.splitlines())
    return [word for word in stripped_lines if word]
|
||||
|
||||
|
||||
def load_metadata() -> Dict[str, Dict[str, object]]:
    """Load the per-word metadata JSON, rebuilding the vocabulary if it is missing."""
    metadata_missing = not METADATA_OUTPUT_PATH.exists()
    if metadata_missing:
        build_vocabulary()
    raw_metadata = METADATA_OUTPUT_PATH.read_text(encoding="utf-8")
    return json.loads(raw_metadata)
|
||||
|
||||
|
||||
def build_lexicon() -> Dict[str, object]:
    """Assemble the base lexicon from the filtered vocabulary and its metadata.

    Returns:
        A dict with a ``meta`` header and one entry per word, in vocabulary
        order. Part of speech, topics, register, frequency and difficulty are
        all derived heuristically from the word's tags and quality score.
    """
    words = load_words()
    metadata = load_metadata()
    total = len(words)

    entries = []
    for position, word in enumerate(words):
        word_meta = metadata.get(word, {})
        tags = list(word_meta.get("tags", []))
        quality = int(word_meta.get("quality", 0))
        frequency_rank, frequency_score = frequency_from_quality(quality, position, total)

        # Higher quality means an easier word: the 1..5 ease level is inverted
        # into a 1..5 difficulty.
        ease_level = max(1, min(5, quality // 2 + 1))
        difficulty = max(1, min(5, 6 - ease_level))

        entries.append(
            {
                "form": word,
                "normalized_form": word,
                "lemma": word,
                "pos": infer_pos(tags),
                "length": len(word),
                "frequency_rank": frequency_rank,
                "frequency_score": frequency_score,
                "difficulty_word": difficulty,
                "allowed_in_crossword": True,
                "quality_score": max(0, min(10, quality)),
                "topics": infer_topics(word, tags),
                "morph_features": {},
                "register": infer_register(quality),
                "source_flags": ["from_filtered_vocabulary", "from_metadata_heuristics"],
                "crossword_flags": tags,
                "notes": "",
            }
        )

    meta = {
        "language": "it",
        "version": 1,
        "sources": ["vocaboli_it_filtrato.txt", "vocaboli_it_metadata.json"],
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(entries),
    }
    return {"meta": meta, "entries": entries}
|
||||
|
||||
|
||||
def main() -> None:
    """Generate the base lexicon, write it next to the script and print a summary."""
    lexicon = build_lexicon()
    serialized = json.dumps(lexicon, ensure_ascii=False, indent=2)
    LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    print(f"Lessico generato: {LEXICON_OUTPUT_PATH}")
    print(f"Voci generate: {lexicon['meta']['entry_count']}")


if __name__ == "__main__":
    main()
|
||||
429
build_llm_rescue_patch.py
Normal file
429
build_llm_rescue_patch.py
Normal file
@@ -0,0 +1,429 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
# Default input (prioritized review list) and output (rescue patch) files,
# both placed next to this script.
PRIORITY_INPUT_PATH = Path(__file__).with_name("to_be_review_priority.json")
PATCH_OUTPUT_PATH = Path(__file__).with_name("llm_rescue_patch.json")


# System message sent with every LLM request. It asks (in Italian) for a
# JSON-only answer with the keys definition, topics, semantic_tags, confidence,
# needs_human_review and notes; normalize_llm_payload() reads those same keys.
SYSTEM_PROMPT = """Sei un lessicografo italiano che prepara definizioni sintetiche per cruciverba.
Ricevi un lemma con parte del discorso e contesto semantico parziale.
Devi proporre una definizione breve in italiano, topic plausibili e tag semantici.

Regole:
- Rispondi solo con JSON valido.
- La definizione deve essere concisa, naturale e utile per un cruciverba.
- Evita di includere il lemma o derivati ovvi del lemma nella definizione.
- Se il termine sembra raro, ambiguo, refuso o poco affidabile, abbassa la confidenza e segnala needs_human_review=true.
- I topic devono essere pochi, in inglese semplice minuscolo con underscore se serve.
- I semantic_tags devono essere pochi, descrittivi e in italiano o inglese semplice.
- Non inventare dettagli enciclopedici troppo specifici se non supportati dal contesto.

Formato JSON obbligatorio:
{
"definition": "...",
"topics": ["topic1", "topic2"],
"semantic_tags": ["tag1", "tag2"],
"confidence": 0.0,
"needs_human_review": true,
"notes": "..."
}
"""
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line of the LLM rescue patch builder.

    Returns:
        Namespace with the input/output paths, batch selection (``limit``,
        ``bucket``), LLM settings (``provider``, ``api_base``, ``api_key_env``,
        ``model``, ``temperature``, ``sleep``) and the ``skip_existing`` and
        ``dry_run`` flags.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Costruisce una patch di rescue lessicale usando un LLM su un lotto di voci "
            "prioritarie tratte da to_be_review_priority.json."
        )
    )
    add = parser.add_argument

    # Files.
    add("--input", type=Path, default=PRIORITY_INPUT_PATH, help="File to_be_review_priority.json di partenza.")
    add("--output", type=Path, default=PATCH_OUTPUT_PATH, help="Patch JSON da generare o aggiornare.")

    # Batch selection.
    add(
        "--limit",
        type=int,
        default=50,
        help="Numero massimo di voci da processare nel lotto. Usa 0 per tutte le voci selezionate.",
    )
    add(
        "--bucket",
        default="red",
        help="Bucket di priorita da considerare: red, orange, yellow oppure all.",
    )

    # LLM endpoint.
    add(
        "--provider",
        choices=("openai_compatible", "ollama"),
        default="openai_compatible",
        help="Tipo di endpoint LLM da usare.",
    )
    add(
        "--api-base",
        default="",
        help=(
            "Endpoint API. Per openai_compatible: .../v1/chat/completions. "
            "Per ollama: .../api/chat."
        ),
    )
    add(
        "--api-key-env",
        default="OPENAI_API_KEY",
        help="Nome della variabile d'ambiente che contiene la API key.",
    )
    add("--model", default="gpt-4.1-mini", help="Nome del modello da interrogare.")
    add("--temperature", type=float, default=0.2, help="Temperatura della richiesta LLM.")
    add("--sleep", type=float, default=0.5, help="Pausa tra una richiesta e la successiva.")

    # Run modes.
    add(
        "--skip-existing",
        action="store_true",
        help="Salta le voci gia presenti nell'output con status drafted/reviewed/done.",
    )
    add(
        "--dry-run",
        action="store_true",
        help="Non chiama alcun LLM: prepara solo il lotto e marca le voci come selected.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Read *path* as UTF-8 JSON, returning *default* when the file is missing."""
    file_is_missing = not path.exists()
    if file_is_missing:
        return default
    document = path.read_text(encoding="utf-8")
    return json.loads(document)
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Write *payload* to *path* as indented UTF-8 JSON, keeping non-ASCII characters."""
    document = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(document, encoding="utf-8")
|
||||
|
||||
|
||||
def build_record(entry: Dict[str, Any]) -> Dict[str, Any]:
    """Create a blank ``pending`` rescue record from a priority-review entry.

    Identification and priority fields are copied over, the available
    semantic hints are collected under ``context``, and every ``rescue_*``
    field starts empty for the LLM step to fill in.
    """
    wiktextract = entry.get("wiktextract") or {}
    babelnet_best = entry.get("babelnet_best_synset") or {}

    # Both sub-documents are optional and may be malformed, so they are only
    # read when they really are dicts.
    if isinstance(wiktextract, dict):
        wiktextract_defs = wiktextract.get("definitions")
        wiktextract_hints = wiktextract.get("topic_hints", [])
    else:
        wiktextract_defs = []
        wiktextract_hints = []
    babelnet_glosses = babelnet_best.get("glosses") if isinstance(babelnet_best, dict) else []

    context = {
        "topic_suggestions": entry.get("topic_suggestions", []),
        "semantic_glosses": entry.get("semantic_glosses", []),
        "senses": entry.get("senses", []),
        "wiktextract_definitions": wiktextract_defs or [],
        "wiktextract_topic_hints": wiktextract_hints,
        "babelnet_glosses": babelnet_glosses or [],
    }
    record = {
        "form": entry.get("form"),
        "lemma": entry.get("lemma"),
        "pos": entry.get("pos"),
        "priority_bucket": entry.get("priority_bucket"),
        "priority_score": entry.get("priority_score"),
        "review_reasons": entry.get("review_reasons", []),
        "current_topics": entry.get("topics", []),
        "current_definition": entry.get("preferred_definition", ""),
        "current_source": entry.get("preferred_source", ""),
        "context": context,
        "rescue_definition": "",
        "rescue_source": "",
        "rescue_topics": [],
        "rescue_semantic_tags": [],
        "rescue_notes": "",
        "confidence": 0.0,
        "needs_human_review": True,
        "status": "pending",
    }
    return record
|
||||
|
||||
|
||||
def build_user_prompt(entry: Dict[str, Any]) -> str:
    """Render the user message for one rescue record.

    The message is a fixed Italian instruction header followed by the record's
    key fields and its context, serialized as indented JSON.
    """
    request_payload = {
        "form": entry.get("form"),
        "lemma": entry.get("lemma"),
        "pos": entry.get("pos"),
        "current_topics": entry.get("current_topics", []),
        "review_reasons": entry.get("review_reasons", []),
        "current_definition": entry.get("current_definition", ""),
        "context": entry.get("context") or {},
    }
    header_lines = [
        "Genera una proposta di rescue lessicale per questa voce italiana.",
        "Se il termine sembra un refuso o una variante dubbia, segnalalo nelle notes.",
        "Payload:",
    ]
    serialized = json.dumps(request_payload, ensure_ascii=False, indent=2)
    return "\n".join(header_lines) + "\n" + serialized
|
||||
|
||||
|
||||
def resolve_api_base(args: argparse.Namespace) -> str:
    """Return the endpoint URL to call.

    An explicit ``--api-base`` wins. Otherwise the default is the local Ollama
    chat endpoint for the ``ollama`` provider and the OpenAI chat-completions
    endpoint for any other provider.
    """
    explicit = args.api_base
    if explicit:
        return explicit
    provider_defaults = {"ollama": "http://localhost:11434/api/chat"}
    return provider_defaults.get(args.provider, "https://api.openai.com/v1/chat/completions")
|
||||
|
||||
|
||||
def request_openai_compatible(
    api_base: str,
    api_key: str,
    model: str,
    temperature: float,
    user_prompt: str,
) -> str:
    """Send one chat-completion request to an OpenAI-compatible endpoint.

    Args:
        api_base: full URL of the chat-completions endpoint.
        api_key: bearer token sent in the ``Authorization`` header.
        model: model name.
        temperature: sampling temperature.
        user_prompt: user message; ``SYSTEM_PROMPT`` is sent as the system message.

    Returns:
        The stripped text of the first choice.

    Raises:
        RuntimeError: on an HTTP error status, or when the response body does
            not contain ``choices[0].message.content``.
        urllib.error.URLError / TimeoutError: on connection problems.
        json.JSONDecodeError: when the response body is not JSON.
    """
    payload = {
        "model": model,
        "temperature": temperature,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
    }
    request = urllib.request.Request(
        api_base,
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=90) as response:
            body = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"OpenAI-compatible HTTP {exc.code}: {detail}") from exc

    # A 200 response can still have an unexpected shape (error object, empty
    # choices). Raise RuntimeError, which the batch loop handles per record,
    # instead of letting KeyError/IndexError/TypeError abort the whole run.
    try:
        content = body["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(
            f"OpenAI-compatible: risposta senza choices[0].message.content ({exc!r})"
        ) from exc
    if content is None:
        # str(None) would pass the literal text "None" downstream.
        raise RuntimeError("OpenAI-compatible: contenuto della risposta nullo.")
    return str(content).strip()
|
||||
|
||||
|
||||
def request_ollama(
    api_base: str,
    model: str,
    temperature: float,
    user_prompt: str,
) -> str:
    """Send one non-streaming chat request to an Ollama endpoint.

    Args:
        api_base: full URL of the Ollama chat endpoint.
        model: model name.
        temperature: sampling temperature, passed under ``options``.
        user_prompt: user message; ``SYSTEM_PROMPT`` is sent as the system message.

    Returns:
        The stripped ``message.content`` text, or an empty string when the
        message or its content is missing.

    Raises:
        RuntimeError: on an HTTP error status, or when the response body or
            its ``message`` field is not a JSON object.
        urllib.error.URLError / TimeoutError: on connection problems.
        json.JSONDecodeError: when the response body is not JSON.
    """
    payload = {
        "model": model,
        "stream": False,
        "options": {"temperature": temperature},
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
    }
    request = urllib.request.Request(
        api_base,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=90) as response:
            body = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Ollama HTTP {exc.code}: {detail}") from exc

    # Check the shape explicitly: an AttributeError here is not handled by the
    # batch loop and would abort the whole run.
    if not isinstance(body, dict):
        raise RuntimeError(f"Ollama: risposta JSON inattesa ({type(body).__name__}).")
    message = body.get("message") or {}
    if not isinstance(message, dict):
        raise RuntimeError(f"Ollama: campo message inatteso ({type(message).__name__}).")

    content = message.get("content", "")
    # A null content is treated like a missing one, not as the text "None".
    return "" if content is None else str(content).strip()
|
||||
|
||||
|
||||
def extract_json_object(text: str) -> Dict[str, Any]:
    """Parse the JSON object embedded in an LLM answer.

    Everything between the first ``{`` and the last ``}`` is parsed, which
    tolerates code fences or prose around the object.

    Raises:
        ValueError: when no ``{ ... }`` span exists. ``json.JSONDecodeError``,
            a ``ValueError`` subclass, is raised when the span is not valid JSON.
    """
    candidate = text.strip()
    opening = candidate.find("{")
    closing = candidate.rfind("}")
    # closing <= opening also covers a missing closing brace (-1).
    if opening < 0 or closing <= opening:
        raise ValueError("Risposta LLM senza oggetto JSON riconoscibile.")
    return json.loads(candidate[opening : closing + 1])
|
||||
|
||||
|
||||
def _clean_string_list(value: Any) -> List[str]:
    """Coerce an LLM list-like field into a list of stripped, non-empty strings."""
    if value is None:
        return []
    if isinstance(value, str):
        # A bare string is one item; iterating it would yield single characters.
        value = [value]
    try:
        items = list(value)
    except TypeError:
        # Scalar that is not iterable (number, bool): nothing usable.
        return []
    cleaned = (str(item).strip() for item in items if item is not None)
    return [text for text in cleaned if text]


def normalize_llm_payload(payload: Dict[str, Any], model: str) -> Dict[str, Any]:
    """Convert the JSON object returned by the LLM into ``rescue_*`` record fields.

    Args:
        payload: parsed LLM answer (see ``SYSTEM_PROMPT`` for the expected keys).
        model: model name, recorded in ``rescue_source``.

    Returns:
        Dict with the rescue definition, source, topics (lower-cased), semantic
        tags, notes, confidence clamped to 0..1, the human-review flag and
        ``status == "drafted"``. Null or missing text fields become empty
        strings. A null or missing ``needs_human_review`` becomes ``True``.

    Raises:
        ValueError: when ``confidence`` cannot be interpreted as a number.
    """
    raw_confidence = payload.get("confidence", 0.0)
    try:
        confidence = float(raw_confidence or 0.0)
    except TypeError as exc:
        # Raise the same exception type as unparsable strings so callers have
        # one error to handle.
        raise ValueError(f"confidence non numerica: {raw_confidence!r}") from exc
    if confidence != confidence:
        # NaN compares unequal to itself; min/max would turn it into 1.0.
        confidence = 0.0

    review_flag = payload.get("needs_human_review", True)

    return {
        # `or ""` keeps a JSON null from being stored as the text "None".
        "rescue_definition": str(payload.get("definition") or "").strip(),
        "rescue_source": f"llm_rescue:{model}",
        "rescue_topics": [topic.lower() for topic in _clean_string_list(payload.get("topics"))],
        "rescue_semantic_tags": _clean_string_list(payload.get("semantic_tags")),
        "rescue_notes": str(payload.get("notes") or "").strip(),
        "confidence": max(0.0, min(1.0, confidence)),
        # An explicit null is treated as "not answered", so review stays on.
        "needs_human_review": True if review_flag is None else bool(review_flag),
        "status": "drafted",
    }
|
||||
|
||||
|
||||
def should_skip_existing(entry: Dict[str, Any]) -> bool:
    """Tell whether a patch record is already drafted, reviewed or done.

    The status comparison is case-insensitive.
    """
    status = str(entry.get("status", "")).lower()
    return status in ("drafted", "reviewed", "done")
|
||||
|
||||
|
||||
def generate_patch(args: argparse.Namespace) -> Dict[str, Any]:
    """Build or update the LLM rescue patch for one batch of priority entries.

    Steps:
      1. Load the priority file and the existing patch, if any.
      2. Select up to ``--limit`` entries from the requested bucket, optionally
         skipping those already drafted, reviewed or done.
      3. For each selected entry, either mark it ``selected`` (dry run) or ask
         the LLM and merge the normalized answer. A failure on one entry marks
         only that record as ``error``.
      4. Keep every pre-existing record not touched in this run, then sort by
         status, descending priority score and form.

    Args:
        args: parsed command-line options (see ``parse_args``).

    Returns:
        The patch document: a ``meta`` header plus the merged ``entries``.

    Raises:
        ValueError: if the priority file is not a JSON object.
        RuntimeError: if the API key is missing for ``openai_compatible``
            outside dry-run mode.
    """

    def form_key(item: Dict[str, Any]) -> str:
        # One normalization for every lookup. Mixing stripped and unstripped
        # keys made --skip-existing miss forms with surrounding whitespace.
        return str(item.get("form", "")).strip().lower()

    source_payload = load_json(args.input, {"entries": []})
    if not isinstance(source_payload, dict):
        raise ValueError(f"File priority non valido: {args.input}")

    output_payload = load_json(args.output, {"entries": []})
    if not isinstance(output_payload, dict):
        output_payload = {"entries": []}

    existing_by_form = {
        form_key(entry): entry
        for entry in output_payload.get("entries", []) or []
        if isinstance(entry, dict) and entry.get("form")
    }

    bucket = str(args.bucket or "red").strip().lower()
    source_entries = source_payload.get("practical_entries") or source_payload.get("entries") or []

    max_items = int(args.limit)
    unlimited = max_items <= 0
    selected: List[Dict[str, Any]] = []
    skipped_preselection = 0
    for entry in source_entries:
        if not isinstance(entry, dict):
            continue
        if bucket != "all" and str(entry.get("priority_bucket", "")).lower() != bucket:
            continue
        form = form_key(entry)
        if not form:
            continue
        existing = existing_by_form.get(form)
        if args.skip_existing and existing and should_skip_existing(existing):
            skipped_preselection += 1
            continue
        selected.append(entry)
        if not unlimited and len(selected) >= max_items:
            break

    api_base = resolve_api_base(args)
    api_key = os.environ.get(args.api_key_env, "") if args.provider == "openai_compatible" else ""
    if not args.dry_run and args.provider == "openai_compatible" and not api_key:
        raise RuntimeError(
            f"Variabile d'ambiente {args.api_key_env} non valorizzata per provider openai_compatible."
        )

    merged_records: List[Dict[str, Any]] = []
    processed = 0
    # Skippable entries are already filtered out during selection, so nothing
    # is skipped here. The counter stays in the metadata for compatibility.
    skipped_existing = 0
    for source_entry in selected:
        existing = existing_by_form.get(form_key(source_entry))
        record = dict(existing) if isinstance(existing, dict) else build_record(source_entry)

        if args.dry_run:
            record["status"] = "selected"
            record["rescue_source"] = f"llm_rescue:{args.model}"
            merged_records.append(record)
            processed += 1
            continue

        user_prompt = build_user_prompt(record)
        try:
            if args.provider == "ollama":
                raw_text = request_ollama(api_base, args.model, args.temperature, user_prompt)
            else:
                raw_text = request_openai_compatible(
                    api_base,
                    api_key,
                    args.model,
                    args.temperature,
                    user_prompt,
                )
            llm_payload = extract_json_object(raw_text)
            record.update(normalize_llm_payload(llm_payload, args.model))
        except (
            urllib.error.URLError,
            TimeoutError,
            OSError,  # dropped or reset connections not wrapped in URLError
            ValueError,
            json.JSONDecodeError,
            RuntimeError,
            # Malformed response shapes. The patch is written only after the
            # loop, so an uncaught error here would discard the whole batch.
            KeyError,
            IndexError,
            TypeError,
            AttributeError,
        ) as exc:
            record["rescue_source"] = f"llm_rescue:{args.model}"
            record["rescue_notes"] = f"errore_llm: {exc}"
            record["status"] = "error"
            record["needs_human_review"] = True
        merged_records.append(record)
        processed += 1
        print(
            f"[{processed}/{len(selected)}] {record.get('form')}: "
            f"status={record.get('status')} conf={record.get('confidence', 0.0)}"
        )
        if record.get("status") == "error" and record.get("rescue_notes"):
            print(f"  dettaglio: {record.get('rescue_notes')}")
        if args.sleep > 0:
            time.sleep(args.sleep)

    # Keep every pre-existing record that was not part of this batch.
    seen_forms = {form_key(item) for item in merged_records}
    for key, existing in existing_by_form.items():
        if key not in seen_forms:
            merged_records.append(existing)

    status_order = {"pending": 0, "selected": 1, "error": 2, "drafted": 3, "reviewed": 4, "done": 5}
    merged_records.sort(
        key=lambda item: (
            status_order.get(str(item.get("status", "pending")), 9),
            -int(item.get("priority_score", 0) or 0),
            str(item.get("form", "")),
        )
    )

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_priority": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "batch_bucket": bucket,
            "batch_limit": int(args.limit),
            "provider": args.provider,
            "api_base": api_base,
            "model": args.model,
            "dry_run": bool(args.dry_run),
            "entry_count": len(merged_records),
            "processed_count": processed,
            "skipped_existing": skipped_existing,
            "skipped_preselection": skipped_preselection,
        },
        "entries": merged_records,
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Generate the LLM rescue patch, write it to disk and print a run summary."""
    cli_args = parse_args()
    patch = generate_patch(cli_args)
    write_json(cli_args.output, patch)

    meta = patch["meta"]
    summary = (
        f"Patch LLM rescue generata: {cli_args.output}",
        f"Voci nel file: {meta['entry_count']}",
        f"Voci processate in questo run: {meta['processed_count']}",
        f"Voci saltate per skip-existing: {meta['skipped_existing']}",
        f"Voci escluse gia in pre-selezione: {meta['skipped_preselection']}",
    )
    for line in summary:
        print(line)


if __name__ == "__main__":
    main()
|
||||
182
build_review_priority.py
Normal file
182
build_review_priority.py
Normal file
@@ -0,0 +1,182 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from collections import Counter
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
|
||||
# Default input: the review file located next to this script.
REVIEW_INPUT_PATH = Path(__file__).with_name("to_be_review.json")
# Default output: the prioritized review file written next to this script.
PRIORITY_OUTPUT_PATH = Path(__file__).with_name("to_be_review_priority.json")

# Points added to an entry's priority score for each review reason it carries.
# Reasons missing from this table are given 5 points by priority_score().
REASON_WEIGHTS = {
    "no_viable_definition": 100,
    "proper_noun_collision": 90,
    "candidate_mentions_answer": 85,
    "function_word": 80,
    "very_short_word": 75,
    "wiktextract_missing": 55,
    "only_general_topics": 45,
    "flagged_by_refined_stage": 35,
    "unresolved_sense_topics": 30,
    "babelnet_ambiguous": 20,
}

# Points added according to the (lowercased) preferred_source of an entry.
# Sources missing from this table add nothing.
SOURCE_WEIGHTS = {
    "fallback": 50,
    "babelnet": 18,
    "semantic": 8,
    "wiktextract": 6,
}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line of the review-priority builder.

    Returns:
        Namespace with ``input`` and ``output`` (paths) and ``top`` (int).
    """
    option_table = (
        (
            "--input",
            {
                "type": Path,
                "default": REVIEW_INPUT_PATH,
                "help": "File to_be_review.json di partenza.",
            },
        ),
        (
            "--output",
            {
                "type": Path,
                "default": PRIORITY_OUTPUT_PATH,
                "help": "File to_be_review_priority.json da generare.",
            },
        ),
        (
            "--top",
            {
                "type": int,
                "default": 0,
                "help": "Numero massimo di voci da tenere nel file priority. 0 = tutte.",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        description="Costruisce un file di review prioritizzato partendo da to_be_review.json."
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> Dict[str, object]:
    """Read *path* as UTF-8 text and return the decoded JSON document."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Serialize *payload* as indented, non-ASCII-escaped JSON and write it to *path*."""
    # Serialize first so a failing dump never touches the target file.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def priority_score(entry: Dict[str, object]) -> Tuple[int, int, int, int, str]:
    """Compute the sort key used to rank a review entry (higher sorts first).

    Args:
        entry: A review record; the keys read are ``review_reasons``,
            ``preferred_source``, ``preferred_definition``,
            ``clue_definitions`` and ``form``.

    Returns:
        ``(score, severe_count, is_fallback, -len(definition), form)``.
        ``score`` sums the reason and source weights plus bonuses for a
        missing definition and few clues, minus a penalty for very short forms.
    """
    reasons = [str(item) for item in entry.get("review_reasons", []) or []]
    # ``or ""`` keeps a JSON null from being stringified to the literal "None",
    # which would count as a present definition or a four-letter form.
    source = str(entry.get("preferred_source") or "").lower()
    preferred_definition = str(entry.get("preferred_definition") or "")
    clue_definitions = entry.get("clue_definitions", {}) or {}
    form = str(entry.get("form") or "")

    score = sum(REASON_WEIGHTS.get(reason, 5) for reason in reasons)
    score += SOURCE_WEIGHTS.get(source, 0)

    if not preferred_definition:
        score += 40

    # Only non-null, non-blank clue texts count as usable clues.
    clue_count = sum(
        1
        for value in clue_definitions.values()
        if value is not None and str(value).strip()
    )
    if clue_count == 0:
        score += 20
    elif clue_count == 1:
        score += 8

    # Small bonus for the number of reasons, capped at five.
    score += min(len(reasons), 5) * 3

    # Very short forms are pushed down the ranking.
    if len(form) <= 2:
        score -= 120
    elif len(form) == 3:
        score -= 35

    severe_reasons = {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
    severe_count = sum(1 for reason in reasons if reason in severe_reasons)

    return (
        score,
        severe_count,
        int(source == "fallback"),
        -len(preferred_definition),
        form,
    )
|
||||
|
||||
|
||||
def priority_bucket(entry: Dict[str, object]) -> str:
    """Classify an entry as ``"red"``, ``"orange"`` or ``"yellow"`` from its review reasons.

    The first tier sharing at least one reason with the entry wins; entries
    matching no tier are ``"yellow"``.
    """
    tiers = (
        ("red", {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}),
        ("orange", {"function_word", "very_short_word", "wiktextract_missing", "only_general_topics"}),
    )
    found = {str(reason) for reason in (entry.get("review_reasons", []) or [])}
    for colour, triggers in tiers:
        if not found.isdisjoint(triggers):
            return colour
    return "yellow"
|
||||
|
||||
|
||||
def compact_entry(entry: Dict[str, object], score_tuple: Tuple[int, int, int, int, str]) -> Dict[str, object]:
    """Return a shallow copy of *entry* annotated with its priority score and bucket.

    Only the first element of *score_tuple* (the numeric score) is stored.
    """
    return {
        **entry,
        "priority_score": score_tuple[0],
        "priority_bucket": priority_bucket(entry),
    }
|
||||
|
||||
|
||||
def build_priority_review(args: argparse.Namespace) -> Dict[str, object]:
    """Rank the entries of the review file and assemble the priority payload.

    Args:
        args: Parsed CLI options; ``input`` is the review file and ``top``
            the maximum number of entries to keep (``0`` keeps all).

    Returns:
        A dict with ``meta`` (counts and timestamp), ``entries`` (all ranked
        entries) and ``practical_entries`` (entries whose form is longer
        than two characters).

    Raises:
        ValueError: If the input is not a JSON object with an ``entries`` key.
    """
    payload = load_json(args.input)
    if not isinstance(payload, dict) or "entries" not in payload:
        raise ValueError(f"File review non valido: {args.input}")

    entries = [entry for entry in payload.get("entries", []) or [] if isinstance(entry, dict)]

    # Score each entry once and reuse the tuple for sorting and annotation.
    scored = [(priority_score(entry), entry) for entry in entries]
    scored.sort(key=lambda pair: pair[0], reverse=True)

    if args.top > 0:
        scored = scored[: args.top]

    compact_entries = [compact_entry(entry, score) for score, entry in scored]

    practical_entries = [
        item
        for item in compact_entries
        if len(str(item.get("form", ""))) > 2
    ]

    bucket_counter = Counter(item["priority_bucket"] for item in compact_entries)
    practical_bucket_counter = Counter(item["priority_bucket"] for item in practical_entries)
    # ``or []`` tolerates a null ``review_reasons`` value, as priority_score does.
    reason_counter = Counter(
        str(reason)
        for item in compact_entries
        for reason in (item.get("review_reasons") or [])
    )

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_review": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "entry_count": len(compact_entries),
            "bucket_counts": dict(bucket_counter),
            "practical_entry_count": len(practical_entries),
            "practical_bucket_counts": dict(practical_bucket_counter),
            "top_reason_counts": dict(reason_counter.most_common(12)),
        },
        "entries": compact_entries,
        "practical_entries": practical_entries,
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Build the prioritized review file and print a short summary."""
    options = parse_args()
    review = build_priority_review(options)
    write_json(options.output, review)

    meta = review["meta"]
    for line in (
        f"Review priority generato: {options.output}",
        f"Voci nel priority file: {meta['entry_count']}",
        f"Bucket: {meta['bucket_counts']}",
    ):
        print(line)


if __name__ == "__main__":
    main()
|
||||
424
build_semantic_lexicon.py
Normal file
424
build_semantic_lexicon.py
Normal file
@@ -0,0 +1,424 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import unicodedata
|
||||
import xml.etree.ElementTree as ET
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Tuple
|
||||
|
||||
from build_lexicon import LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Location of the IWN-OMW LMF XML file, resolved relative to this script.
IWN_XML_PATH = Path(__file__).with_name("iwn-omw-main") / "IWN-OMW-main" / "data" / "LMF-XML" / "IWN-OMW_LMF_v1.0.xml"
# Output file of the semantically enriched lexicon, written next to this script.
SEMANTIC_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_semantic.json")

# Maps the one-letter partOfSpeech codes read from the XML to upper-case POS tags.
# parse_iwn() falls back to "NOUN" for codes not listed here.
IWN_POS_MAP = {
    "n": "NOUN",
    "v": "VERB",
    "a": "ADJ",
    "s": "ADJ",
    "r": "ADV",
}

# Topic name -> accent-free, lowercase Italian keywords. A topic is assigned
# by semantic_topics_from_text() when any of its keywords is among the tokens.
SEMANTIC_TOPIC_KEYWORDS = {
    "animals": {
        "animale", "animali", "mammifero", "mammiferi", "uccello", "uccelli", "pesce",
        "rettile", "domestico", "compagnia", "caccia", "pastorizia",
    },
    "plants": {
        "pianta", "piante", "albero", "alberi", "fiore", "foglia", "foglie", "frutto",
        "ghianda", "bosco", "radice", "seme", "vegetale",
    },
    "nature": {
        "natura", "naturale", "terra", "acqua", "aria", "mare", "montagna", "bosco",
        "lago", "fiume", "vento", "roccia", "suolo", "superficie", "terrestre",
    },
    "ecology": {
        "ecologia", "ambiente", "ambientale", "clima", "energia", "naturale", "verde",
        "ecosistema", "acqua", "terra",
    },
    "geography": {
        "territorio", "superficie", "terrestre", "regione", "confine", "montagna",
        "isola", "mare", "lago", "fiume",
    },
    "weather": {
        "clima", "pioggia", "vento", "nuvola", "nebbia", "tempesta", "gelo", "brina",
        "atmosfera",
    },
    "sea": {
        "mare", "marino", "marina", "acque", "salate", "porto", "barca", "vela",
        "nave", "fondale",
    },
    "mountain": {
        "montagna", "vetta", "collina", "rilievo", "alpino", "roccia", "sentiero",
    },
    "health": {
        "salute", "medico", "medicina", "corpo", "sangue", "cura", "malattia",
        "terapia", "cervello", "respiro",
    },
    "science": {
        "scienza", "scientifico", "tecnica", "misura", "energia", "materia", "fisica",
        "chimica", "biologia", "strumento",
    },
    "sport": {
        "sport", "gara", "squadra", "pallone", "atleta", "stadio", "rete", "gioco",
        "agonistico",
    },
    "history": {
        "storia", "storico", "antico", "regno", "impero", "senato", "romano", "epoca",
    },
    "school": {
        "scuola", "lezione", "studente", "classe", "maestro", "esame", "libro",
        "quaderno", "aula",
    },
    "cinema": {
        "film", "cinema", "pellicola", "regista", "attore", "scena", "spettacolo",
        "teatro",
    },
    "literature": {
        "libro", "autore", "lettura", "scrittura", "racconto", "poesia", "romanzo",
        "letteratura",
    },
    "food": {
        "cibo", "bevanda", "mangiare", "pane", "frutto", "latte", "zucchero", "farina",
        "gelato",
    },
    "city": {
        "citta", "urbano", "strada", "piazza", "ponte", "palazzo", "stazione", "porta",
        "quartiere",
    },
    "transport": {
        "veicolo", "trasporto", "strada", "motore", "treno", "ruota", "barca", "nave",
        "aereo", "automobile",
    },
    "work": {
        "lavoro", "mestiere", "opera", "progetto", "strumento", "tecnica", "servizio",
    },
    "home": {
        "casa", "abitazione", "porta", "finestra", "parete", "camera", "balcone",
        "tavolo", "sedia",
    },
}
|
||||
|
||||
|
||||
def normalize_word(text: str) -> str:
    """Fold *text* to a lowercase string containing only the letters ``a``-``z``.

    Accents are removed through NFKD decomposition; every other character
    (digits, spaces, punctuation, underscores) is dropped.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    folded = decomposed.encode("ascii", "ignore").decode("ascii").lower()
    return "".join(char for char in folded if "a" <= char <= "z")
|
||||
|
||||
|
||||
def semantic_topics_from_text(parts: Iterable[str]) -> List[str]:
    """Return the sorted topics whose keywords occur in any of *parts*.

    Each part contributes its fully collapsed form (as produced by
    ``normalize_word``) and every individual word it contains, accent-folded
    and lowercased.

    Args:
        parts: Text fragments such as glosses or lemmas.

    Returns:
        Alphabetically sorted names from ``SEMANTIC_TOPIC_KEYWORDS``.
    """
    tokens = set()
    for part in parts:
        collapsed = normalize_word(part)
        if not collapsed:
            continue
        tokens.add(collapsed)
        # normalize_word() removes spaces, so a multi-word gloss collapses into
        # one token; tokenize the accent-folded text itself to recover the
        # individual words that the keyword sets are written against.
        folded = unicodedata.normalize("NFKD", part).encode("ascii", "ignore").decode("ascii").lower()
        tokens.update(re.findall(r"[a-z]+", folded))

    return sorted(
        topic
        for topic, keywords in SEMANTIC_TOPIC_KEYWORDS.items()
        if not tokens.isdisjoint(keywords)
    )
|
||||
|
||||
|
||||
def parse_iwn() -> Tuple[Dict[str, Dict[str, object]], Dict[str, List[Dict[str, object]]]]:
    """Parse the IWN-OMW XML file into synset and lexical-entry indexes.

    Returns:
        A pair ``(synsets, entries_by_norm)``:

        * ``synsets`` maps a synset id to a dict with ``id``, ``ili``,
          ``definition``, ``relations``, ``lemmas`` (deduplicated written
          forms) and ``pos``.
        * ``entries_by_norm`` maps a normalized written form to the lexical
          entries that have at least one sense pointing to a known synset.
    """
    xml_text = IWN_XML_PATH.read_text(encoding="utf-8")
    # NOTE(review): rewrites every `" -->` into `">` before parsing —
    # presumably a workaround for malformed markup in the source file; confirm.
    xml_text = xml_text.replace('" -->', '">')
    root = ET.fromstring(xml_text)

    synsets: Dict[str, Dict[str, object]] = {}
    entries_by_norm: Dict[str, List[Dict[str, object]]] = defaultdict(list)

    # First pass: register every synset with its definition and outgoing relations.
    for synset in root.findall(".//Synset"):
        synset_id = synset.attrib.get("id", "")
        relations = [
            {
                "type": relation.attrib.get("relType", ""),
                "target": relation.attrib.get("target", ""),
                "subtype": relation.attrib.get("{https://globalwordnet.github.io/schemas/dc/}type", ""),
            }
            for relation in synset.findall("SynsetRelation")
        ]
        synsets[synset_id] = {
            "id": synset_id,
            "ili": synset.attrib.get("ili", ""),
            "definition": (synset.findtext("Definition") or "").strip(),
            "relations": relations,
            "lemmas": [],
            "pos": "",
        }

    # Second pass: attach lemmas to their synsets and index entries by normalized form.
    for lexical_entry in root.findall(".//LexicalEntry"):
        lemma = lexical_entry.find("Lemma")
        if lemma is None:
            continue

        written_form = lemma.attrib.get("writtenForm", "").strip()
        normalized_form = normalize_word(written_form)
        if not normalized_form:
            continue

        # Unknown or missing part-of-speech codes default to "NOUN".
        pos = IWN_POS_MAP.get(lemma.attrib.get("partOfSpeech", "").strip().lower(), "NOUN")
        senses = lexical_entry.findall("Sense")
        sense_payloads = []

        for sense in senses:
            synset_id = sense.attrib.get("synset", "")
            sense_id = sense.attrib.get("id", "")
            # Skip senses that reference no synset or one not seen in the first pass.
            if not synset_id or synset_id not in synsets:
                continue

            synsets[synset_id]["lemmas"].append(written_form)
            # The synset takes the POS of the last lexical entry that references it.
            synsets[synset_id]["pos"] = pos
            sense_payloads.append(
                {
                    "sense_id": sense_id,
                    "synset_id": synset_id,
                    "pos": pos,
                }
            )

        if sense_payloads:
            entries_by_norm[normalized_form].append(
                {
                    "written_form": written_form,
                    "normalized_form": normalized_form,
                    "pos": pos,
                    "senses": sense_payloads,
                }
            )

    # Deduplicate each synset's lemma list while keeping first-seen order.
    for synset in synsets.values():
        unique_lemmas = []
        seen = set()
        for lemma in synset["lemmas"]:
            if lemma not in seen:
                seen.add(lemma)
                unique_lemmas.append(lemma)
        synset["lemmas"] = unique_lemmas

    return synsets, entries_by_norm
|
||||
|
||||
|
||||
def score_sense(
    synset_id: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> int:
    """Score a synset by topic overlap with the entry, plus its gloss length.

    Returns ten points per topic shared between *current_topics* and the
    topics inferred from the synset's definition and lemmas, plus the number
    of characters in the definition. An unknown *synset_id* scores ``0``.
    """
    record = synsets.get(synset_id, {})
    gloss = str(record.get("definition", ""))
    text_parts = [gloss, *record.get("lemmas", [])]
    detected = set(semantic_topics_from_text(text_parts))
    wanted = {str(topic) for topic in current_topics}
    shared = detected.intersection(wanted)
    return 10 * len(shared) + len(gloss)
|
||||
|
||||
|
||||
def best_candidate(
    candidates: List[Dict[str, object]],
    expected_pos: str,
    current_topics: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> Dict[str, object]:
    """Pick the candidate with the highest combined POS and sense score.

    A candidate earns 100 points when its ``pos`` equals *expected_pos*, plus
    the best ``score_sense`` value among its senses. Ties go to the earliest
    candidate.

    Raises:
        IndexError: If *candidates* is empty.
    """

    def combined_score(option: Dict[str, object]) -> int:
        pos_points = 100 if option["pos"] == expected_pos else 0
        top_sense = 0
        for sense in option.get("senses", []):
            sense_points = score_sense(str(sense.get("synset_id", "")), current_topics, synsets)
            if sense_points > top_sense:
                top_sense = sense_points
        return pos_points + top_sense

    found = False
    winner: Dict[str, object] = {}
    winner_points = 0
    for option in candidates:
        points = combined_score(option)
        # Strict comparison keeps the first of several equally scored candidates.
        if not found or points > winner_points:
            found = True
            winner = option
            winner_points = points

    if not found:
        raise IndexError("list index out of range")
    return winner
|
||||
|
||||
|
||||
def dedupe_keep_order(items: Iterable[str]) -> List[str]:
    """Return the stripped, non-empty string forms of *items* without duplicates.

    The first occurrence of each value determines its position in the result.
    """
    stripped = (str(item).strip() for item in items)
    # dict keys preserve insertion order, giving an ordered de-duplication.
    return list(dict.fromkeys(text for text in stripped if text))
|
||||
|
||||
|
||||
def resolve_related_lemmas(
    synset_ids: Iterable[str],
    relation_type: str,
    synsets: Dict[str, Dict[str, object]],
) -> List[str]:
    """Collect the lemmas of synsets reachable through *relation_type*.

    For every known synset in *synset_ids*, follows each relation whose
    ``type`` equals *relation_type* and gathers the target synset's lemmas.
    Unknown ids and targets are ignored; the result is deduplicated in
    first-seen order.
    """
    gathered = []
    for origin_id in synset_ids:
        origin = synsets.get(origin_id)
        if origin:
            for link in origin.get("relations", []):
                if link.get("type") == relation_type:
                    destination = synsets.get(link.get("target", ""))
                    if destination:
                        gathered.extend(destination.get("lemmas", []))
    return dedupe_keep_order(gathered)
|
||||
|
||||
|
||||
def collect_relation_terms(
    synset_ids: Iterable[str],
    relation_types: Iterable[str],
    synsets: Dict[str, Dict[str, object]],
) -> Dict[str, List[str]]:
    """Map each relation type to at most 20 related lemmas.

    Args:
        synset_ids: Ids of the synsets whose relations are followed.
        relation_types: Relation names to collect (for example ``"hypernym"``).
        synsets: Synset index keyed by synset id.

    Returns:
        ``{relation_type: lemmas}`` with each list truncated to 20 items.
    """
    # Materialize once: the ids are walked once per relation type, so a
    # one-shot iterator would be exhausted after the first type.
    id_list = list(synset_ids)
    return {
        relation_type: resolve_related_lemmas(id_list, relation_type, synsets)[:20]
        for relation_type in relation_types
    }
|
||||
|
||||
|
||||
def enrich_entry(
    entry: Dict[str, object],
    synsets: Dict[str, Dict[str, object]],
    entries_by_norm: Dict[str, List[Dict[str, object]]],
) -> Dict[str, object]:
    """Attach a ``semantic`` section to *entry* and return the same dict.

    The entry is looked up in *entries_by_norm* by the normalized values of
    its ``form``, ``lemma`` and ``normalized_form``. When nothing matches, an
    empty ``semantic`` payload with ``matched: False`` is stored. Otherwise
    the best candidate is chosen, its senses are ranked, and glosses,
    synonyms, relation terms and inferred topics are recorded.

    Args:
        entry: Lexicon entry; it is modified in place.
        synsets: Synset index keyed by synset id.
        entries_by_norm: Lexical entries keyed by normalized written form.

    Returns:
        The mutated *entry*.
    """
    # Lookup keys, deduplicated; empty normalizations are dropped by dedupe_keep_order.
    normalized_candidates = dedupe_keep_order(
        [
            normalize_word(str(entry.get("form", ""))),
            normalize_word(str(entry.get("lemma", ""))),
            normalize_word(str(entry.get("normalized_form", ""))),
        ]
    )
    matches: List[Dict[str, object]] = []
    for candidate_key in normalized_candidates:
        matches.extend(entries_by_norm.get(candidate_key, []))

    if not matches:
        entry["semantic"] = {
            "source": "iwn-omw",
            "matched": False,
            "match_count": 0,
            "synsets": [],
            "synonyms": [],
            "raw_relation_terms": {},
            "glosses": [],
            "semantic_topics": [],
        }
        return entry

    selected = best_candidate(matches, str(entry.get("pos", "")), entry.get("topics", []), synsets)
    # Rank the chosen candidate's senses, best score first.
    sorted_senses = sorted(
        selected.get("senses", []),
        key=lambda sense: score_sense(str(sense.get("synset_id", "")), entry.get("topics", []), synsets),
        reverse=True,
    )
    synset_ids = [sense["synset_id"] for sense in sorted_senses]
    synset_payloads = []
    glosses = []
    synonyms = []
    # Only the three best-ranked synsets are expanded into detailed payloads.
    for synset_id in synset_ids[:3]:
        synset = synsets.get(synset_id)
        if not synset:
            continue
        gloss = str(synset.get("definition", "")).strip()
        glosses.append(gloss)
        synset_payloads.append(
            {
                "id": synset_id,
                "pos": synset.get("pos", ""),
                "definition": gloss,
                "lemmas": dedupe_keep_order(synset.get("lemmas", []))[:12],
                "raw_relation_terms": collect_relation_terms(
                    [synset_id],
                    ("hypernym", "hyponym", "similar"),
                    synsets,
                ),
            }
        )
        synonyms.extend(synset.get("lemmas", []))

    # Entry-level relation terms are collected over all ranked synsets, not just the top three.
    raw_relation_terms = collect_relation_terms(
        synset_ids,
        ("hypernym", "hyponym", "similar"),
        synsets,
    )
    # Drop lemmas that normalize to the entry's own form, then cap at 20.
    synonyms = [
        lemma
        for lemma in dedupe_keep_order(synonyms)
        if normalize_word(lemma) != normalize_word(str(entry.get("form", "")))
    ][:20]
    glosses = dedupe_keep_order(glosses)
    semantic_topics = dedupe_keep_order(
        semantic_topics_from_text(
            glosses
            + synonyms
            + raw_relation_terms.get("hypernym", [])
            + raw_relation_terms.get("hyponym", [])
            + raw_relation_terms.get("similar", [])
        )
    )
    entry["semantic"] = {
        "source": "iwn-omw",
        "matched": True,
        "match_count": len(matches),
        "selected_form": selected.get("written_form", ""),
        "synsets": synset_payloads,
        "synonyms": synonyms,
        "raw_relation_terms": raw_relation_terms,
        "glosses": glosses,
        "semantic_topics": semantic_topics,
    }
    return entry
|
||||
|
||||
|
||||
def build_semantic_lexicon() -> Dict[str, object]:
    """Enrich every entry of the base lexicon with IWN-OMW semantic data.

    Returns:
        A dict with ``meta`` (provenance, timestamp, entry count) and
        ``entries`` (copies of the base entries with a ``semantic`` section).

    Raises:
        FileNotFoundError: If the base lexicon or the IWN XML file is missing.
    """
    base_exists = LEXICON_OUTPUT_PATH.exists()
    if not base_exists:
        raise FileNotFoundError(f"Lessico di base non trovato: {LEXICON_OUTPUT_PATH}")
    wordnet_exists = IWN_XML_PATH.exists()
    if not wordnet_exists:
        raise FileNotFoundError(f"File Open ItalWordNet non trovato: {IWN_XML_PATH}")

    base_lexicon = json.loads(LEXICON_OUTPUT_PATH.read_text(encoding="utf-8"))
    synsets, entries_by_norm = parse_iwn()

    # Each entry is copied first so the loaded payload is left untouched.
    enriched = [
        enrich_entry(dict(raw_entry), synsets, entries_by_norm)
        for raw_entry in base_lexicon.get("entries", [])
    ]

    meta = {
        "language": "it",
        "version": 1,
        "base_lexicon": str(LEXICON_OUTPUT_PATH.name),
        "sources": [
            "lexicon_it.json",
            "iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml",
        ],
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(enriched),
        "semantic_source": "IWN-OMW v1.0",
    }
    return {"meta": meta, "entries": enriched}
|
||||
|
||||
|
||||
def main() -> None:
    """Build the semantic lexicon, write it to disk and print match statistics."""
    lexicon = build_semantic_lexicon()
    serialized = json.dumps(lexicon, ensure_ascii=False, indent=2)
    SEMANTIC_LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    matched = 0
    for item in lexicon["entries"]:
        if item.get("semantic", {}).get("matched"):
            matched += 1

    print(f"Lessico semantico generato: {SEMANTIC_LEXICON_OUTPUT_PATH}")
    print(f"Voci totali: {lexicon['meta']['entry_count']}")
    print(f"Voci con match semantico: {matched}")


if __name__ == "__main__":
    main()
|
||||
153
build_treccani_rescue_patch.py
Normal file
153
build_treccani_rescue_patch.py
Normal file
@@ -0,0 +1,153 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
# Default input: the prioritized review file located next to this script.
PRIORITY_INPUT_PATH = Path(__file__).with_name("to_be_review_priority.json")
# Default output: the rescue patch created or updated next to this script.
PATCH_OUTPUT_PATH = Path(__file__).with_name("treccani_rescue_patch.json")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command line of the rescue-patch builder.

    Returns:
        Namespace with ``input`` and ``output`` (paths), ``limit`` (int) and
        ``bucket`` (str).
    """
    option_table = (
        (
            "--input",
            {
                "type": Path,
                "default": PRIORITY_INPUT_PATH,
                "help": "File to_be_review_priority.json di partenza.",
            },
        ),
        (
            "--output",
            {
                "type": Path,
                "default": PATCH_OUTPUT_PATH,
                "help": "Patch JSON da generare o aggiornare.",
            },
        ),
        (
            "--limit",
            {
                "type": int,
                "default": 100,
                "help": "Numero massimo di voci da preparare nel lotto.",
            },
        ),
        (
            "--bucket",
            {
                "default": "red",
                "help": "Bucket di priorita da considerare: red, orange, yellow oppure all.",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        description=(
            "Estrae un lotto prioritario dal file to_be_review_priority.json per preparare una patch "
            "manuale/assistita di rescue lessicale."
        )
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Return the JSON document stored at *path*, or *default* when the path does not exist."""
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Serialize *payload* as indented, non-ASCII-escaped JSON and write it to *path*."""
    # Serialize first so a failing dump never touches the target file.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def build_record(entry: Dict[str, object]) -> Dict[str, object]:
    """Create a blank rescue record for a priority-review entry.

    Identification and priority fields are copied from *entry*; the current
    definition, source and topics are kept for reference; the ``rescue_*``
    fields start empty and ``status`` is ``"pending"``.
    """
    copied_fields = ("form", "lemma", "pos", "priority_bucket", "priority_score")
    record: Dict[str, object] = {field: entry.get(field) for field in copied_fields}

    record["review_reasons"] = entry.get("review_reasons", [])
    record["current_topics"] = entry.get("topics", [])
    record["current_definition"] = entry.get("preferred_definition", "")
    record["current_source"] = entry.get("preferred_source", "")

    record.update(
        rescue_definition="",
        rescue_source="treccani_rescue",
        rescue_topics=[],
        rescue_semantic_tags=[],
        rescue_notes="",
        status="pending",
    )
    return record
|
||||
|
||||
|
||||
def build_patch(args: argparse.Namespace) -> Dict[str, object]:
    """Select a priority batch and merge it with the existing rescue patch.

    Entries of the requested bucket are taken from the priority file (its
    ``practical_entries`` when present, otherwise ``entries``) until the limit
    is reached. Records already in the output patch are reused as they are;
    missing ones are created with ``build_record``. Existing records outside
    the batch are preserved.

    Args:
        args: Parsed CLI options (``input``, ``output``, ``limit``, ``bucket``).

    Returns:
        A dict with ``meta`` and the merged, sorted ``entries``.

    Raises:
        ValueError: If the priority file does not contain a JSON object.
    """

    def form_key(record: Dict[str, object]) -> str:
        # One normalization for every comparison: trimmed and lowercased,
        # with a null form treated as empty.
        return str(record.get("form") or "").strip().lower()

    payload = load_json(args.input, {"entries": []})
    if not isinstance(payload, dict):
        raise ValueError(f"File priority non valido: {args.input}")

    existing_patch = load_json(args.output, {"entries": []})
    if not isinstance(existing_patch, dict):
        existing_patch = {"entries": []}

    existing_by_form: Dict[str, Dict[str, object]] = {}
    for entry in existing_patch.get("entries", []) or []:
        if isinstance(entry, dict) and entry.get("form"):
            # Fall back to the raw lowercased form so that a record whose form
            # is only whitespace is still carried over rather than dropped.
            existing_by_form[form_key(entry) or str(entry["form"]).lower()] = entry

    bucket = str(args.bucket or "red").strip().lower()
    source_entries = payload.get("practical_entries") or payload.get("entries") or []
    limit = max(1, int(args.limit))

    selected: List[Dict[str, object]] = []
    selected_forms = set()
    for entry in source_entries:
        if not isinstance(entry, dict):
            continue
        if bucket != "all" and str(entry.get("priority_bucket", "")).lower() != bucket:
            continue
        form = form_key(entry)
        # Skip entries without a form and forms already in the batch, so a
        # repeated input form cannot produce duplicate records.
        if not form or form in selected_forms:
            continue
        selected_forms.add(form)
        selected.append(entry)
        if len(selected) >= limit:
            break

    merged_records = []
    for entry in selected:
        form = form_key(entry)
        if form in existing_by_form:
            merged_records.append(existing_by_form[form])
        else:
            merged_records.append(build_record(entry))

    # Keep previously stored records that are not part of this batch.
    merged_records.extend(
        entry for key, entry in existing_by_form.items() if key not in selected_forms
    )

    status_rank = {"pending": 0, "drafted": 1, "reviewed": 2, "done": 3}
    merged_records.sort(
        key=lambda item: (
            status_rank.get(str(item.get("status", "pending")), 9),
            -int(item.get("priority_score", 0) or 0),
            str(item.get("form", "")),
        )
    )

    return {
        "meta": {
            "language": "it",
            "version": 1,
            "base_priority": args.input.name,
            "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "batch_bucket": bucket,
            "batch_limit": int(args.limit),
            "entry_count": len(merged_records),
        },
        "entries": merged_records,
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Build the rescue patch, write it to disk and print the batch size."""
    options = parse_args()
    patch = build_patch(options)
    write_json(options.output, patch)

    entry_count = patch["meta"]["entry_count"]
    print(f"Patch rescue generata: {options.output}")
    print(f"Voci nel lotto: {entry_count}")


if __name__ == "__main__":
    main()
|
||||
243
build_vocabulary.py
Normal file
243
build_vocabulary.py
Normal file
@@ -0,0 +1,243 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional
|
||||
|
||||
|
||||
# Source word list (a JSON object whose keys are the words), relative to this script.
PACKAGE_WORDS_PATH = Path(__file__).with_name("package") / "dist" / "words.json"
# Output: every normalized word, one per line.
OUTPUT_PATH = Path(__file__).with_name("vocaboli_it_esteso.txt")
# Output: only the words accepted by is_good_crossword_word().
FILTERED_OUTPUT_PATH = Path(__file__).with_name("vocaboli_it_filtrato.txt")
# Output: per-word tags and quality score.
METADATA_OUTPUT_PATH = Path(__file__).with_name("vocaboli_it_metadata.json")
# Words shorter than this are discarded during normalization.
MIN_WORD_LENGTH = 2
# Words longer than this are tagged "too_long" and penalized.
MAX_WORD_LENGTH = 14

# Italian prepositions, articulated prepositions and conjunctions tagged as "function".
COMMON_FUNCTION_WORDS = {
    "a",
    "ad",
    "al",
    "allo",
    "ai",
    "agli",
    "alla",
    "alle",
    "con",
    "col",
    "coi",
    "da",
    "dal",
    "dallo",
    "dai",
    "dagli",
    "dalla",
    "dalle",
    "di",
    "del",
    "dello",
    "dei",
    "degli",
    "della",
    "delle",
    "e",
    "ed",
    "in",
    "nel",
    "nello",
    "nei",
    "negli",
    "nella",
    "nelle",
    "o",
    "od",
    "per",
    "su",
    "sul",
    "sullo",
    "sui",
    "sugli",
    "sulla",
    "sulle",
    "tra",
    "fra",
}

# Suffix tuples passed to str.endswith() by categorize_word() as part-of-speech heuristics.
COMMON_VERB_SUFFIXES = ("are", "ere", "ire")
COMMON_ADVERB_SUFFIXES = ("mente",)
COMMON_NOUN_SUFFIXES = (
    "zione",
    "zioni",
    "tore",
    "tori",
    "trice",
    "trici",
    "ista",
    "isti",
    "ismo",
    "ismi",
    "anza",
    "enze",
    "enza",
    "ezza",
    "ezze",
    "ita",
    "ore",
    "ori",
)
COMMON_ADJECTIVE_SUFFIXES = (
    "ale",
    "ali",
    "oso",
    "osa",
    "osi",
    "ose",
    "ivo",
    "iva",
    "ivi",
    "ive",
    "ente",
    "enti",
)
|
||||
|
||||
|
||||
def maybe_fix_mojibake(text: str) -> str:
    """Undo a UTF-8-read-as-Latin-1 mix-up when *text* shows its typical markers.

    Text without the marker characters is returned untouched, as is text that
    cannot be round-tripped through Latin-1 and UTF-8.
    """
    markers = ("Ã", "Â")
    if not any(marker in text for marker in markers):
        return text
    try:
        repaired = text.encode("latin-1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text
    else:
        return repaired
|
||||
|
||||
|
||||
def strip_accents(text: str) -> str:
    """Return *text* in NFKD form with all combining marks removed."""
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [char for char in decomposed if unicodedata.combining(char) == 0]
    return "".join(base_chars)
|
||||
|
||||
|
||||
def normalize_word(word: str) -> Optional[str]:
    """Normalize a raw word to lowercase ``a``-``z`` letters, or reject it.

    Mojibake is repaired, accents are stripped, and apostrophes, hyphens and
    spaces are removed.

    Args:
        word: Raw word as found in the source list.

    Returns:
        The normalized word, or ``None`` when it is shorter than
        ``MIN_WORD_LENGTH`` or still contains characters outside ``a``-``z``.
    """
    # Repair mojibake on the raw text: lowercasing turns the uppercase marker
    # characters into other letters, and stripping can remove a trailing
    # non-breaking space that belongs to the broken byte sequence. Either
    # would stop the repair from ever running.
    clean = maybe_fix_mojibake(word).strip().lower()
    clean = clean.replace("’", "'").replace("`", "'")
    clean = strip_accents(clean)
    for separator in ("'", "-", " "):
        clean = clean.replace(separator, "")

    if len(clean) < MIN_WORD_LENGTH:
        return None
    if not re.fullmatch(r"[a-z]+", clean):
        return None
    return clean
|
||||
|
||||
|
||||
def categorize_word(word: str) -> Dict[str, object]:
    """Tag *word* with heuristic categories and compute its quality score.

    Returns:
        ``{"tags": sorted tag names, "quality": bonuses minus penalties}``.
    """
    length = len(word)
    if length <= 4:
        size_tag, size_points = "short", 2
    elif length <= 9:
        size_tag, size_points = "medium", 3
    else:
        size_tag, size_points = "long", 1

    has_triple_repeat = re.search(r"(.)\1{2,}", word) is not None
    has_consonant_cluster = re.search(r"[^aeiou]{4,}", word) is not None

    # Each rule is (applies, tag, points); penalties carry negative points.
    rules = (
        (word in COMMON_FUNCTION_WORDS, "function", 6),
        (word.endswith(COMMON_VERB_SUFFIXES), "verb_infinitive", 4),
        (word.endswith(COMMON_ADVERB_SUFFIXES), "adverb", 3),
        (word.endswith(COMMON_NOUN_SUFFIXES), "noun_like", 3),
        (word.endswith(COMMON_ADJECTIVE_SUFFIXES), "adjective_like", 2),
        (True, size_tag, size_points),
        (len(set(word)) >= max(4, length // 2), "varied_letters", 2),
        (has_triple_repeat, "repetition_penalty", -3),
        (has_consonant_cluster, "cluster_penalty", -2),
        (length > MAX_WORD_LENGTH, "too_long", -6),
    )

    labels = {tag for applies, tag, _ in rules if applies}
    quality = sum(points for applies, _, points in rules if applies)
    return {"tags": sorted(labels), "quality": quality}
|
||||
|
||||
|
||||
def is_good_crossword_word(word: str, meta: Dict[str, object]) -> bool:
    """Decide whether *word* is kept in the filtered vocabulary.

    A word is rejected when it is tagged ``too_long``, when it has 13 or more
    letters with quality below 5 and is neither a function word nor an
    infinitive, or when its quality is below 2.
    """
    labels = set(meta["tags"])
    quality = int(meta["quality"])

    if "too_long" in labels:
        return False

    exempt = bool(labels & {"function", "verb_infinitive"})
    long_and_weak = len(word) >= 13 and not exempt and quality < 5
    return not long_and_weak and quality >= 2
|
||||
|
||||
|
||||
def extract_words(raw_words: Iterable[str]) -> List[str]:
    """Normalize *raw_words*, drop rejected ones and return the unique results sorted."""
    candidates = (normalize_word(raw) for raw in raw_words)
    accepted = {candidate for candidate in candidates if candidate is not None}
    return sorted(accepted)
|
||||
|
||||
|
||||
def build_vocabulary(source_path: Path = PACKAGE_WORDS_PATH, output_path: Path = OUTPUT_PATH) -> Dict[str, int]:
    """Build the extended vocabulary, the filtered vocabulary and the metadata file.

    Args:
        source_path: UTF-8 JSON file whose top-level keys are the raw words.
        output_path: Destination of the extended word list (one word per line).

    Returns:
        Counters under ``extended_words``, ``filtered_words`` and
        ``metadata_entries``.

    Note:
        The filtered list and the metadata are always written to
        ``FILTERED_OUTPUT_PATH`` and ``METADATA_OUTPUT_PATH``; only the
        extended list honours ``output_path``.
    """
    raw_payload = json.loads(source_path.read_text(encoding="utf-8"))
    words = extract_words(raw_payload.keys())
    output_path.write_text("\n".join(words) + "\n", encoding="utf-8")

    # Categorize every word once and keep those that pass the quality filter.
    metadata = {}
    filtered_words = []
    for word in words:
        info = categorize_word(word)
        metadata[word] = info
        if is_good_crossword_word(word, info):
            filtered_words.append(word)

    FILTERED_OUTPUT_PATH.write_text("\n".join(filtered_words) + "\n", encoding="utf-8")
    serialized_metadata = json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True)
    METADATA_OUTPUT_PATH.write_text(serialized_metadata, encoding="utf-8")

    return {
        "extended_words": len(words),
        "filtered_words": len(filtered_words),
        "metadata_entries": len(metadata),
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Run the vocabulary build and print the output paths and counters."""
    totals = build_vocabulary()
    report_lines = (
        f"Vocabolario esteso: {OUTPUT_PATH}",
        f"Vocabolario filtrato: {FILTERED_OUTPUT_PATH}",
        f"Metadati: {METADATA_OUTPUT_PATH}",
        f"Parole estese: {totals['extended_words']}",
        f"Parole filtrate: {totals['filtered_words']}",
        f"Metadati generati: {totals['metadata_entries']}",
    )
    for line in report_lines:
        print(line)
|
||||
|
||||
|
||||
# Run the build only when the module is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
423
clue_generator.py
Normal file
423
clue_generator.py
Normal file
@@ -0,0 +1,423 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH, TOPIC_DOMAIN_RULES, load_json
|
||||
from crossword_generator import HORIZONTAL, Placement
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Clue:
    """Immutable clue attached to one placed word."""

    number: int  # progressive clue number
    word: str  # answer word, as stored on the placement
    direction: str  # "orizzontale" or "verticale" when built by generate_clues
    x: int  # x coordinate copied from the placement
    y: int  # y coordinate copied from the placement
    text: str  # clue text shown to the player
    source: str  # origin label of the text; "fallback" when no candidate was available
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ClueCandidate:
    """Immutable candidate clue text with the attributes used to rank it."""

    text: str  # cleaned clue text
    source: str  # origin label (e.g. "semantic", "babelnet", or a sense-specific source)
    family: str  # candidate family, used for the per-family bonus in score_candidate
    difficulty_hint: str  # "direct", "balanced" or "oblique", as returned by candidate_hint
    topic_score: int  # topic relevance score; values >= 40 earn the larger bonus
    strong_topic: bool  # True when the candidate is considered strongly on-topic
|
||||
|
||||
|
||||
# Accepted difficulty spellings mapped to the four canonical levels.
# Numeric strings "1".."5" are accepted; "4" and "5" both map to "expert".
DIFFICULTY_ALIASES = {
    "1": "easy",
    "2": "medium",
    "3": "hard",
    "4": "expert",
    "5": "expert",
    "easy": "easy",
    "medium": "medium",
    "hard": "hard",
    "expert": "expert",
}
|
||||
|
||||
# Lower-case substrings that mark a clue as generic boilerplate;
# score_candidate penalizes any candidate text containing one of them.
GENERIC_CLUE_PATTERNS = (
    "termine da ricavare dagli incroci",
    "termine lessicale collegato",
    "collegato a:",
)
|
||||
|
||||
|
||||
def load_enriched_entries(path: Path = ENRICHED_LEXICON_OUTPUT_PATH) -> Dict[str, Dict[str, object]]:
    """Load the enriched lexicon and index its entries by lower-cased form.

    Returns an empty dict when the payload is not a dict. Entries that are not
    dicts or that have no truthy ``"form"`` are skipped; a later entry with the
    same form replaces an earlier one.
    """
    payload = load_json(path, {"entries": []})
    if not isinstance(payload, dict):
        return {}

    by_form: Dict[str, Dict[str, object]] = {}
    for record in payload.get("entries", []) or []:
        if not isinstance(record, dict):
            continue
        form = record.get("form")
        if not form:
            continue
        by_form[str(form).lower()] = record
    return by_form
|
||||
|
||||
|
||||
def normalize_difficulty(value: Optional[str]) -> str:
    """Map a difficulty alias to its canonical level, defaulting to ``"medium"``.

    Falsy values and unknown aliases both resolve to ``"medium"``.
    """
    alias = str(value or "medium").strip().lower()
    try:
        return DIFFICULTY_ALIASES[alias]
    except KeyError:
        return "medium"
|
||||
|
||||
|
||||
def clean_definition(text: str, answer: str) -> str:
    """Turn a raw definition into a presentable clue that hides the answer.

    Steps: drop ``[...]`` annotations, collapse whitespace, trim surrounding
    punctuation, replace whole-word occurrences of ``answer`` (case-insensitive)
    with ``"questa parola"``, tidy empty parentheses and spaces before ``,`` /
    ``;``, capitalize the first letter and append a final period.

    Args:
        text: Raw definition; falsy values are treated as empty.
        answer: The solution word to mask. An empty answer masks nothing.

    Returns:
        The cleaned clue ending with ``"."``, or ``""`` when nothing is left
        after trimming.
    """
    clue = str(text or "")
    clue = re.sub(r"\[[^\]]*\]", " ", clue)
    clue = re.sub(r"\s+", " ", clue).strip(" .;:-")
    if not clue:
        return ""
    if answer:
        # Match the answer only as a whole word: a raw substring match would
        # corrupt unrelated words for short answers ("re" inside "treno"), and
        # an empty pattern would match between every character.
        masked = rf"(?<!\w){re.escape(answer)}(?!\w)"
        clue = re.sub(masked, "questa parola", clue, flags=re.IGNORECASE)
    clue = re.sub(r"\(\s*\)", "", clue)
    clue = re.sub(r"\s+,", ",", clue)
    clue = re.sub(r"\s+;", ";", clue)
    if clue and clue[0].islower():
        clue = clue[0].upper() + clue[1:]
    return clue + "."
|
||||
|
||||
|
||||
def synset_has_strong_topic_domain(synset: Dict[str, object], topic: Optional[str]) -> bool:
    """Tell whether a synset belongs to one of the topic's "strong" domains.

    Returns ``True`` without inspecting the synset when no topic is given, the
    topic is ``"general"``, or the topic has no strong domains configured in
    ``TOPIC_DOMAIN_RULES``. Domain names are compared upper-cased.
    """
    if not topic or topic == "general":
        return True

    required = {str(name).upper() for name in TOPIC_DOMAIN_RULES.get(topic, {}).get("strong", ())}
    if not required:
        return True

    synset_domains = {str(name).upper() for name in synset.get("domains", []) or []}
    return not synset_domains.isdisjoint(required)
|
||||
|
||||
|
||||
def text_contains_answer(text: str, answer: str) -> bool:
    """Return True when ``answer`` occurs in ``text`` as a literal, case-insensitive substring."""
    literal = re.compile(re.escape(answer), re.IGNORECASE)
    return literal.search(text) is not None
|
||||
|
||||
|
||||
def directness_score(text: str) -> int:
    """Score how plainly a clue names the category of its answer.

    Each category keyword found in the lower-cased text adds 8 points; the
    presence of at least one equivalence marker ("cioè", "ossia", "ovvero")
    adds a single 4-point bonus.
    """
    haystack = text.lower()
    category_words = (
        "strumento",
        "veicolo",
        "animale",
        "pianta",
        "titolo",
        "edificio",
        "persona",
        "luogo",
        "malattia",
        "farmaco",
        "mezzo",
        "parte di",
    )
    equivalence_markers = ("cioè", "ossia", "ovvero")

    points = 8 * sum(1 for word in category_words if word in haystack)
    if any(marker in haystack for marker in equivalence_markers):
        points += 4
    return points
|
||||
|
||||
|
||||
def preferred_length_range(difficulty: str) -> Tuple[int, int]:
    """Return the preferred ``(min, max)`` clue length in characters for a difficulty.

    Any value other than "easy", "medium" or "hard" gets the shortest range.
    """
    ranges = (
        ("easy", (24, 90)),
        ("medium", (20, 75)),
        ("hard", (16, 60)),
    )
    for level, bounds in ranges:
        if difficulty == level:
            return bounds
    return 14, 50
|
||||
|
||||
|
||||
def score_candidate(candidate: ClueCandidate, answer: str, difficulty: str) -> int:
    """Rank a clue candidate for the given answer and difficulty; higher is better.

    Texts that are empty or shorter than 12 characters are rejected outright
    with ``-10_000``. Otherwise the score is the sum of independent
    adjustments: generic-pattern and answer-leak penalties, fit to the
    preferred length range, directness weighted by difficulty, a per-family
    bonus, agreement between the difficulty hint and the difficulty,
    semicolon handling, topic relevance, register markers and overall length.
    """
    clue_text = candidate.text
    if not clue_text or len(clue_text) < 12:
        return -10_000

    clue_lower = clue_text.lower()
    clue_length = len(clue_text)
    total = 0

    # Boilerplate clues and clues leaking the answer are heavily penalized.
    if any(pattern in clue_lower for pattern in GENERIC_CLUE_PATTERNS):
        total -= 120
    total += -140 if text_contains_answer(clue_text, answer) else 40

    # Fit to the preferred length range: overshoot costs one point per
    # character, undershoot half a point per character.
    shortest, longest = preferred_length_range(difficulty)
    if shortest <= clue_length <= longest:
        total += 28
    elif clue_length > longest:
        total -= clue_length - longest
    else:
        total -= (shortest - clue_length) // 2

    # Direct clues help at easy/medium and hurt at hard/expert.
    directness = directness_score(clue_text)
    directness_delta = {
        "easy": directness * 2,
        "medium": directness,
        "hard": -max(0, directness - 6),
    }
    total += directness_delta.get(difficulty, -directness)

    bonus_by_family = {
        "semantic_definition": 56,
        "semantic_gloss": 34,
        "refined_sense": 30,
        "babelnet_best_gloss": 18,
        "babelnet_gloss": 10,
        "fallback": 0,
    }
    total += bonus_by_family.get(candidate.family, 0)

    hints_by_difficulty = {
        "easy": {"direct", "balanced"},
        "medium": {"balanced", "direct"},
        "hard": {"balanced", "oblique"},
        "expert": {"oblique", "balanced"},
    }
    if candidate.difficulty_hint in hints_by_difficulty.get(difficulty, {"balanced"}):
        total += 18

    # A semicolon is rewarded at easy level and penalized at hard/expert.
    if ";" in clue_text:
        if difficulty == "easy":
            total += 8
        if difficulty in {"hard", "expert"}:
            total -= 8

    # Topic relevance; BabelNet glosses with no positive topic score are penalized.
    if candidate.topic_score >= 40:
        total += 18
    elif candidate.topic_score > 0:
        total += 8
    elif candidate.family in {"babelnet_best_gloss", "babelnet_gloss"}:
        total -= 140

    if candidate.strong_topic:
        total += 10

    # Register markers such as "(fig.)" are penalized at easy/medium.
    if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", clue_lower):
        total -= 28

    # Very long clues are penalized cumulatively.
    if clue_length > 120:
        total -= 45
    if clue_length > 180:
        total -= 90

    return total
|
||||
|
||||
|
||||
def candidate_hint(text: str, family: str) -> str:
    """Classify a clue text as "direct", "oblique" or "balanced".

    Checks run in order: semantic-family texts of at most 70 characters are
    "direct"; texts with a figurative/literary marker are "oblique"; texts
    longer than 85 characters are "direct"; everything else is "balanced".
    """
    size = len(text)
    is_semantic = family in {"semantic_definition", "semantic_gloss"}
    if is_semantic and size <= 70:
        return "direct"

    folded = text.lower()
    figurative_markers = ("fig.", "figurato", "poetico", "letterario")
    if any(folded.find(marker) != -1 for marker in figurative_markers):
        return "oblique"

    return "direct" if size > 85 else "balanced"
|
||||
|
||||
|
||||
def add_candidate(
    candidates: List[ClueCandidate],
    seen: set[Tuple[str, str]],
    *,
    text: str,
    answer: str,
    source: str,
    family: str,
    topic_score: int = 0,
    strong_topic: bool = False,
) -> None:
    """Clean ``text`` and append it to ``candidates`` unless empty or already seen.

    Duplicates are detected per ``(lower-cased cleaned text, family)`` pair;
    both ``candidates`` and ``seen`` are mutated in place.
    """
    cleaned = clean_definition(text, answer)
    dedup_key = (cleaned.lower(), family)
    if not cleaned or dedup_key in seen:
        return

    seen.add(dedup_key)
    candidate = ClueCandidate(
        text=cleaned,
        source=source,
        family=family,
        difficulty_hint=candidate_hint(cleaned, family),
        topic_score=topic_score,
        strong_topic=strong_topic,
    )
    candidates.append(candidate)
|
||||
|
||||
|
||||
def semantic_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
    """Collect candidates from the entry's ``"semantic"`` section.

    Synset definitions come first (family ``semantic_definition``), then the
    section-level glosses (family ``semantic_gloss``). Returns an empty list
    when the section is not a dict; non-dict synsets are skipped.
    """
    semantic = entry.get("semantic", {})
    if not isinstance(semantic, dict):
        return []

    pending = [
        (str(synset.get("definition", "")), "semantic_definition")
        for synset in semantic.get("synsets", []) or []
        if isinstance(synset, dict)
    ]
    pending.extend((str(gloss), "semantic_gloss") for gloss in semantic.get("glosses", []) or [])

    collected: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()
    for raw_text, family in pending:
        add_candidate(
            collected,
            seen,
            text=raw_text,
            answer=answer,
            source="semantic",
            family=family,
        )
    return collected
|
||||
|
||||
|
||||
def babelnet_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
    """Collect candidates from the entry's ``"babelnet"`` section.

    Only sections whose status is "enriched" or "ambiguous" are used. Glosses
    of the best synset come first (family ``babelnet_best_gloss``) with the
    synset's own topic score; then glosses of every listed synset (family
    ``babelnet_gloss``). With a specific topic, listed synsets lacking a strong
    topic domain are skipped and the remaining ones get topic score 40; with
    no topic or "general" they all get topic score 0.
    """
    babelnet = entry.get("babelnet", {})
    usable = isinstance(babelnet, dict) and babelnet.get("status") in {"enriched", "ambiguous"}
    if not usable:
        return []

    collected: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()
    topic_is_specific = bool(topic) and topic != "general"

    best = babelnet.get("best_synset", {})
    if isinstance(best, dict):
        best_score = int(best.get("topic_score", 0) or 0)
        best_is_strong = bool(best.get("strong_topic")) or synset_has_strong_topic_domain(best, topic)
        for gloss in best.get("glosses", []) or []:
            add_candidate(
                collected,
                seen,
                text=str(gloss),
                answer=answer,
                source="babelnet",
                family="babelnet_best_gloss",
                topic_score=best_score,
                strong_topic=best_is_strong,
            )

    for synset in babelnet.get("synsets", []) or []:
        if not isinstance(synset, dict):
            continue
        if topic_is_specific and not synset_has_strong_topic_domain(synset, topic):
            continue
        # Any synset reaching this point under a specific topic has a strong
        # domain, so the score depends only on whether the topic is specific.
        synset_score = 40 if topic_is_specific else 0
        for gloss in synset.get("glosses", []) or []:
            add_candidate(
                collected,
                seen,
                text=str(gloss),
                answer=answer,
                source="babelnet",
                family="babelnet_gloss",
                topic_score=synset_score,
                strong_topic=synset_score >= 40,
            )

    return collected
|
||||
|
||||
|
||||
def refined_sense_candidates(entry: Dict[str, object], answer: str) -> List[ClueCandidate]:
    """Collect candidates from the entry's ``"senses"`` list (family ``refined_sense``).

    Each sense's confidence, read as a float with missing or falsy values
    counting as 0, becomes the topic score scaled by 100; a confidence of at
    least 0.75 marks the candidate as strongly on-topic. Returns an empty list
    when ``"senses"`` is not a list; non-dict senses are skipped.
    """
    senses = entry.get("senses", [])
    if not isinstance(senses, list):
        return []

    collected: List[ClueCandidate] = []
    seen: set[Tuple[str, str]] = set()
    for sense in (item for item in senses if isinstance(item, dict)):
        confidence = float(sense.get("confidence", 0.0) or 0.0)
        add_candidate(
            collected,
            seen,
            text=str(sense.get("definition", "")),
            answer=answer,
            source=str(sense.get("source", "refined")),
            family="refined_sense",
            topic_score=int(confidence * 100),
            strong_topic=confidence >= 0.75,
        )
    return collected
|
||||
|
||||
|
||||
def fallback_definition(entry: Dict[str, object], answer: str) -> str:
    """Build a last-resort clue from the entry's part of speech and topics.

    Args:
        entry: Lexicon entry; ``"pos"`` and ``"topics"`` are optional and may
            be ``None``. ``"topics"`` may be a list or a single string.
        answer: Unused; kept for signature compatibility with callers.

    Returns:
        A sentence naming the non-"general" topics when there is at least one,
        otherwise the generic "Termine da ricavare dagli incroci." clue.
    """
    # `or ""` / `or []` also cover keys that are present with a null value,
    # which would otherwise render as "none" or raise TypeError.
    pos = str(entry.get("pos") or "").lower()
    raw_topics = entry.get("topics") or []
    if isinstance(raw_topics, str):
        # A bare string would otherwise be iterated character by character.
        raw_topics = [raw_topics]
    topics = ", ".join(
        str(topic)
        for topic in raw_topics
        if topic and str(topic).lower() != "general"
    )
    if topics:
        return f"Termine {pos or 'lessicale'} collegato all'ambito: {topics}."
    return "Termine da ricavare dagli incroci."
|
||||
|
||||
|
||||
def all_candidates(entry: Dict[str, object], answer: str, topic: Optional[str]) -> List[ClueCandidate]:
    """Gather every clue candidate for an entry: semantic, refined-sense, then BabelNet."""
    return [
        *semantic_candidates(entry, answer),
        *refined_sense_candidates(entry, answer),
        *babelnet_candidates(entry, answer, topic),
    ]
|
||||
|
||||
|
||||
def choose_candidate(candidates: Sequence[ClueCandidate], answer: str, difficulty: str) -> Optional[ClueCandidate]:
    """Pick the best-ranked candidate, or ``None`` when there are none.

    Candidates are compared by score, then topic score, then text length
    (longer wins); among full ties the earliest candidate is returned.
    """

    def ranking_key(candidate: ClueCandidate) -> Tuple[int, int, int]:
        return (
            score_candidate(candidate, answer, difficulty),
            candidate.topic_score,
            len(candidate.text),
        )

    return max(candidates, key=ranking_key, default=None)
|
||||
|
||||
|
||||
def definition_for_word(
    word: str,
    entries: Dict[str, Dict[str, object]],
    topic: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> tuple[str, str]:
    """Return ``(clue text, source label)`` for a word.

    The word is looked up lower-cased in ``entries``. Words with no (or an
    empty) entry get the generic fallback clue; words whose entry yields no
    candidate get the topic-based fallback. Both use the source "fallback".
    """
    answer = word.lower()
    entry = entries.get(answer, {})
    if not entry:
        return "Termine da ricavare dagli incroci.", "fallback"

    level = normalize_difficulty(difficulty)
    best = choose_candidate(all_candidates(entry, answer, topic), answer, level)
    if not best:
        return fallback_definition(entry, answer), "fallback"
    return best.text, best.source
|
||||
|
||||
|
||||
def generate_clues(
    placements: Iterable[Placement],
    entries: Dict[str, Dict[str, object]],
    topic: Optional[str] = None,
    difficulty: Optional[str] = None,
) -> List[Clue]:
    """Build one ``Clue`` per placement, numbered from 1 in iteration order.

    The direction is rendered as "orizzontale" for horizontal placements and
    "verticale" otherwise.
    """

    def build_clue(number: int, placement: Placement) -> Clue:
        text, source = definition_for_word(placement.word, entries, topic, difficulty)
        return Clue(
            number=number,
            word=placement.word,
            direction="orizzontale" if placement.direction == HORIZONTAL else "verticale",
            x=placement.x,
            y=placement.y,
            text=text,
            source=source,
        )

    return [build_clue(number, placement) for number, placement in enumerate(placements, start=1)]
|
||||
55
create_passo3.bat
Normal file
55
create_passo3.bat
Normal file
@@ -0,0 +1,55 @@
|
||||
@echo off
rem Switches to (or creates) the "passo3" branch, stages every change,
rem commits it and pushes the branch to "origin".
rem Usage: create_passo3.bat ["custom commit message"]
setlocal

rem Work from the directory that contains this script.
cd /d "%~dp0"

set "BRANCH_NAME=passo3"
set "COMMIT_MSG=feat: aggiunge CLI unificata, build vocabolario e filtro lessicale"

rem An optional first argument replaces the default commit message.
if not "%~1"=="" (
    set "COMMIT_MSG=%~1"
)

echo Repository: %cd%
echo Branch target: %BRANCH_NAME%
echo Commit message: %COMMIT_MSG%
echo.

rem Abort when the current directory is not inside a Git work tree.
git rev-parse --is-inside-work-tree >nul 2>nul
if errorlevel 1 (
    echo Errore: questa cartella non e' un repository Git.
    exit /b 1
)

rem Create the branch when it does not exist yet, otherwise just check it out.
git show-ref --verify --quiet refs/heads/%BRANCH_NAME%
if errorlevel 1 (
    echo Creo il branch %BRANCH_NAME%...
    git checkout -b %BRANCH_NAME%
) else (
    echo Il branch %BRANCH_NAME% esiste gia', ci passo sopra...
    git checkout %BRANCH_NAME%
)
if errorlevel 1 exit /b 1

echo.
echo Aggiungo le modifiche...
git add .
if errorlevel 1 exit /b 1

echo.
echo Creo il commit...
git commit -m "%COMMIT_MSG%"
rem A failed commit (for example nothing to commit) stops the script before the push.
if errorlevel 1 (
    echo.
    echo Nessun commit creato. Potrebbe non esserci nulla di nuovo da salvare.
    exit /b 1
)

echo.
echo Eseguo il push del branch %BRANCH_NAME%...
git push -u origin %BRANCH_NAME%
if errorlevel 1 exit /b 1

echo.
echo Operazione completata con successo.
endlocal
|
||||
55
create_passo4.bat
Normal file
55
create_passo4.bat
Normal file
@@ -0,0 +1,55 @@
|
||||
@echo off
rem Switches to (or creates) the "passo4" branch, stages an explicit list of
rem project files, commits them and pushes the branch to "origin".
rem Usage: create_passo4.bat ["custom commit message"]
setlocal

rem Work from the directory that contains this script.
cd /d "%~dp0"

set "BRANCH_NAME=passo4"
set "COMMIT_MSG=feat: consolida lessico semantico, temi controllati e filler a quota tematica"

rem An optional first argument replaces the default commit message.
if not "%~1"=="" (
    set "COMMIT_MSG=%~1"
)

echo Repository: %cd%
echo Branch target: %BRANCH_NAME%
echo Commit message: %COMMIT_MSG%
echo.

rem Abort when the current directory is not inside a Git work tree.
git rev-parse --is-inside-work-tree >nul 2>nul
if errorlevel 1 (
    echo Errore: questa cartella non e' un repository Git.
    exit /b 1
)

rem Create the branch when it does not exist yet, otherwise just check it out.
git show-ref --verify --quiet refs/heads/%BRANCH_NAME%
if errorlevel 1 (
    echo Creo il branch %BRANCH_NAME%...
    git checkout -b %BRANCH_NAME%
) else (
    echo Il branch %BRANCH_NAME% esiste gia', ci passo sopra...
    git checkout %BRANCH_NAME%
)
if errorlevel 1 exit /b 1

echo.
echo Aggiungo le modifiche di progetto, escludendo cache Python e cache API...
rem Only the listed sources, data files and folders are staged.
rem NOTE(review): the repository .gitignore lists lexicon_it*.json; git add may
rem reject explicitly named ignored paths unless they are already tracked - confirm.
git add *.py *.bat *.txt lexicon_it.json lexicon_it_semantic.json vocaboli_it_metadata.json package iwn-omw-main
if errorlevel 1 exit /b 1

echo.
echo Creo il commit...
git commit -m "%COMMIT_MSG%"
rem A failed commit (for example nothing to commit) stops the script before the push.
if errorlevel 1 (
    echo.
    echo Nessun commit creato. Potrebbe non esserci nulla di nuovo da salvare.
    exit /b 1
)

echo.
echo Eseguo il push del branch %BRANCH_NAME%...
git push -u origin %BRANCH_NAME%
if errorlevel 1 exit /b 1

echo.
echo Operazione completata con successo.
endlocal
|
||||
209
crossword_contract.md
Normal file
209
crossword_contract.md
Normal file
@@ -0,0 +1,209 @@
|
||||
# Contratto JSON del Cruciverba
|
||||
|
||||
Questo documento definisce il formato di scambio tra:
|
||||
|
||||
- `brain`: il motore che genera e compila il cruciverba
|
||||
- `client`: web app, backend, servizio PDF o altra macchina remota che richiede un cruciverba
|
||||
|
||||
L'obiettivo è avere un payload:
|
||||
|
||||
- completo
|
||||
- stabile
|
||||
- espandibile
|
||||
- riusabile per stampa PDF, gioco web e archiviazione
|
||||
|
||||
## Flusso
|
||||
|
||||
1. Il client invia una `request` JSON al motore.
|
||||
2. Il motore risponde con una `response` JSON completa del cruciverba.
|
||||
3. Lo stesso JSON di risposta può essere:
   - salvato a database
   - convertito in PDF
   - renderizzato in una pagina web interattiva
   - riaperto in futuro senza rigenerare il cruciverba
|
||||
|
||||
## Principi di progettazione
|
||||
|
||||
- Ogni cruciverba ha un `crossword_id` univoco.
|
||||
- La `request` conserva i parametri di generazione originali.
|
||||
- La `response` include sia la griglia giocabile sia la soluzione.
|
||||
- Le parole hanno metadati ricchi: posizione, direzione, clue, tema, pos, fonte clue.
|
||||
- Le coordinate sono sempre assolute e 0-based nella griglia normalizzata esportata.
|
||||
- La griglia esportata è rettangolare e normalizzata: niente coordinate negative.
|
||||
- Il formato supporta versioning con `schema_version`.
|
||||
|
||||
## Request
|
||||
|
||||
Campi principali:
|
||||
|
||||
- `schema_version`: versione del contratto
|
||||
- `request_id`: id della richiesta lato client
|
||||
- `requested_at`: timestamp ISO 8601
|
||||
- `generator`: configurazione del motore
|
||||
- `output`: preferenze di output
|
||||
- `client_context`: metadati opzionali del chiamante
|
||||
|
||||
### `generator`
|
||||
|
||||
- `topic`: stringa o lista di topic
|
||||
- `difficulty`: alias testuale
|
||||
- `seed`: opzionale, per riproducibilità
|
||||
- `initial_word_count`
|
||||
- `themed_fill_count`
|
||||
- `target_empty_ratio`
|
||||
- `diffxy`
|
||||
- `time_limit_seconds`
|
||||
- `max_candidates_per_word`
|
||||
- `lexicon_file`
|
||||
- `definitions_enabled`
|
||||
- `definition_style`: per future varianti clue
|
||||
- `preferred_output_language`
|
||||
|
||||
### `output`
|
||||
|
||||
- `include_solution_grid`
|
||||
- `include_clue_sources`
|
||||
- `include_diagnostics`
|
||||
- `include_generation_log`
|
||||
- `format_hints`
|
||||
|
||||
## Response
|
||||
|
||||
Campi principali:
|
||||
|
||||
- `schema_version`
|
||||
- `request_id`
|
||||
- `crossword_id`
|
||||
- `generated_at`
|
||||
- `status`
|
||||
- `generator`
|
||||
- `summary`
|
||||
- `grid`
|
||||
- `entries`
|
||||
- `clues`
|
||||
- `solution`
|
||||
- `diagnostics`
|
||||
- `artifacts`
|
||||
|
||||
## Sezione `grid`
|
||||
|
||||
- `rows`
|
||||
- `cols`
|
||||
- `cell_size_hint`
|
||||
- `cells`
|
||||
|
||||
Ogni cella ha:
|
||||
|
||||
- `row`
|
||||
- `col`
|
||||
- `kind`: `block` oppure `letter`
|
||||
- `solution`
|
||||
- `display`
|
||||
- `number`: numero clue se la cella apre una parola
|
||||
- `across_entry_id`
|
||||
- `down_entry_id`
|
||||
- `is_prefilled`
|
||||
|
||||
Note:
|
||||
|
||||
- `solution` contiene sempre la lettera corretta per celle attive.
|
||||
- `display` è vuoto per la scheda giocatore.
|
||||
- `number` serve per numerazione in stampa e web.
|
||||
|
||||
## Sezione `entries`
|
||||
|
||||
Ogni entry rappresenta una parola collocata in griglia.
|
||||
|
||||
Campi:
|
||||
|
||||
- `entry_id`
|
||||
- `number`
|
||||
- `direction`: `across` o `down`
|
||||
- `answer`
|
||||
- `answer_length`
|
||||
- `row`
|
||||
- `col`
|
||||
- `cells`: lista coordinate
|
||||
- `clue`
|
||||
- `clue_source`
|
||||
- `topics`
|
||||
- `pos`
|
||||
- `is_seed`
|
||||
- `added_by_filler`
|
||||
- `confidence`
|
||||
|
||||
## Sezione `clues`
|
||||
|
||||
Ridondante ma utile per client semplici.
|
||||
|
||||
- `across`: lista clues orizzontali
|
||||
- `down`: lista clues verticali
|
||||
|
||||
Ogni clue:
|
||||
|
||||
- `number`
|
||||
- `entry_id`
|
||||
- `text`
|
||||
- `enumeration`
|
||||
- `topic_match`
|
||||
- `source`
|
||||
|
||||
## Sezione `solution`
|
||||
|
||||
- `grid_rows`: lista di stringhe, una per riga
|
||||
- `words`: elenco risposte
|
||||
|
||||
`grid_rows` usa:
|
||||
|
||||
- lettera maiuscola per cella piena
|
||||
- `#` per casella nera
|
||||
|
||||
## Sezione `diagnostics`
|
||||
|
||||
Serve a tuning, benchmark e debug.
|
||||
|
||||
- `total_words`
|
||||
- `seed_words_requested`
|
||||
- `seed_words_placed`
|
||||
- `filler_words_added`
|
||||
- `intersections`
|
||||
- `filled_cells`
|
||||
- `empty_cells`
|
||||
- `empty_ratio`
|
||||
- `target_empty_ratio`
|
||||
- `topic_words`
|
||||
- `off_topic_words`
|
||||
- `pos_counts`
|
||||
- `runtime_lexicon`
|
||||
- `seed`
|
||||
- `generation_seconds`
|
||||
|
||||
## Sezione `artifacts`
|
||||
|
||||
URL o path futuri per file derivati.
|
||||
|
||||
- `pdf_player`
|
||||
- `pdf_solution`
|
||||
- `thumbnail`
|
||||
- `html_preview`
|
||||
|
||||
## Estensioni future previste
|
||||
|
||||
- `difficulty_profile`: facile/medio/difficile per definizioni separate
|
||||
- `hints`: aiuti progressivi per singola parola
|
||||
- `theme_story`: testo introduttivo del cruciverba
|
||||
- `player_state`: salvataggio partita in corso
|
||||
- `stats`: tempi, errori, percentuali di completamento
|
||||
|
||||
## Regola pratica consigliata
|
||||
|
||||
La macchina "brain" deve esporre almeno due endpoint logici:
|
||||
|
||||
- `POST /crosswords/generate`
  - input: request JSON
  - output: response JSON

- `GET /crosswords/{crossword_id}`
  - output: stessa response JSON salvata
|
||||
|
||||
In questo modo il contratto resta identico sia via file sia via webservice.
|
||||
37
crossword_contract_example_request.json
Normal file
37
crossword_contract_example_request.json
Normal file
@@ -0,0 +1,37 @@
|
||||
{
|
||||
"schema_version": "1.0",
|
||||
"request_id": "req-2026-04-28-0001",
|
||||
"requested_at": "2026-04-28T17:05:00+02:00",
|
||||
"generator": {
|
||||
"topic": [
|
||||
"transport"
|
||||
],
|
||||
"difficulty": "medium",
|
||||
"seed": 2,
|
||||
"initial_word_count": 19,
|
||||
"themed_fill_count": 10,
|
||||
"target_empty_ratio": 0.1667,
|
||||
"diffxy": 7,
|
||||
"time_limit_seconds": 8.0,
|
||||
"max_candidates_per_word": 12,
|
||||
"lexicon_file": "lexicon_it_curated_llm_aggressive.json",
|
||||
"definitions_enabled": true,
|
||||
"definition_style": "classic",
|
||||
"preferred_output_language": "it"
|
||||
},
|
||||
"output": {
|
||||
"include_solution_grid": true,
|
||||
"include_clue_sources": true,
|
||||
"include_diagnostics": true,
|
||||
"include_generation_log": false,
|
||||
"format_hints": {
|
||||
"pdf_page_size": "A4",
|
||||
"mobile_layout": true
|
||||
}
|
||||
},
|
||||
"client_context": {
|
||||
"channel": "web",
|
||||
"user_locale": "it-IT",
|
||||
"app_version": "alpha-1"
|
||||
}
|
||||
}
|
||||
138
crossword_contract_example_response.json
Normal file
138
crossword_contract_example_response.json
Normal file
@@ -0,0 +1,138 @@
|
||||
{
|
||||
"schema_version": "1.0",
|
||||
"request_id": "req-2026-04-28-0001",
|
||||
"crossword_id": "cw-2026-04-28-transport-0001",
|
||||
"generated_at": "2026-04-28T17:06:42+02:00",
|
||||
"status": "ok",
|
||||
"generator": {
|
||||
"topic": [
|
||||
"transport"
|
||||
],
|
||||
"difficulty": "medium",
|
||||
"seed": 2,
|
||||
"runtime_lexicon": "lexicon_it_curated_llm_aggressive.json"
|
||||
},
|
||||
"summary": {
|
||||
"title": "Cruciverba a tema trasporti",
|
||||
"subtitle": "Schema generato automaticamente",
|
||||
"rows": 12,
|
||||
"cols": 12,
|
||||
"total_words": 6,
|
||||
"intersections": 7
|
||||
},
|
||||
"grid": {
|
||||
"rows": 12,
|
||||
"cols": 12,
|
||||
"cell_size_hint": 32,
|
||||
"cells": [
|
||||
{
|
||||
"row": 0,
|
||||
"col": 0,
|
||||
"kind": "letter",
|
||||
"solution": "A",
|
||||
"display": "",
|
||||
"number": 1,
|
||||
"across_entry_id": "A1",
|
||||
"down_entry_id": null,
|
||||
"is_prefilled": false
|
||||
},
|
||||
{
|
||||
"row": 0,
|
||||
"col": 1,
|
||||
"kind": "letter",
|
||||
"solution": "M",
|
||||
"display": "",
|
||||
"number": null,
|
||||
"across_entry_id": "A1",
|
||||
"down_entry_id": "D2",
|
||||
"is_prefilled": false
|
||||
},
|
||||
{
|
||||
"row": 0,
|
||||
"col": 2,
|
||||
"kind": "block",
|
||||
"solution": null,
|
||||
"display": null,
|
||||
"number": null,
|
||||
"across_entry_id": null,
|
||||
"down_entry_id": null,
|
||||
"is_prefilled": false
|
||||
}
|
||||
]
|
||||
},
|
||||
"entries": [
|
||||
{
|
||||
"entry_id": "A1",
|
||||
"number": 1,
|
||||
"direction": "across",
|
||||
"answer": "AMBULANZA",
|
||||
"answer_length": 9,
|
||||
"row": 0,
|
||||
"col": 0,
|
||||
"cells": [
|
||||
[0, 0],
|
||||
[0, 1],
|
||||
[0, 2]
|
||||
],
|
||||
"clue": "Veicolo di soccorso sanitario.",
|
||||
"clue_source": "semantic_definition",
|
||||
"topics": [
|
||||
"transport",
|
||||
"health"
|
||||
],
|
||||
"pos": "NOUN",
|
||||
"is_seed": true,
|
||||
"added_by_filler": false,
|
||||
"confidence": 0.95
|
||||
}
|
||||
],
|
||||
"clues": {
|
||||
"across": [
|
||||
{
|
||||
"number": 1,
|
||||
"entry_id": "A1",
|
||||
"text": "Veicolo di soccorso sanitario.",
|
||||
"enumeration": 9,
|
||||
"topic_match": true,
|
||||
"source": "semantic_definition"
|
||||
}
|
||||
],
|
||||
"down": []
|
||||
},
|
||||
"solution": {
|
||||
"grid_rows": [
|
||||
"AM#ULA######",
|
||||
"##B#########"
|
||||
],
|
||||
"words": [
|
||||
"AMBULANZA"
|
||||
]
|
||||
},
|
||||
"diagnostics": {
|
||||
"seed_words_requested": 19,
|
||||
"seed_words_placed": 19,
|
||||
"filler_words_added": 5,
|
||||
"filled_cells": 84,
|
||||
"empty_cells": 18,
|
||||
"empty_ratio": 0.1765,
|
||||
"target_empty_ratio": 0.1667,
|
||||
"topic_words": 21,
|
||||
"off_topic_words": 3,
|
||||
"pos_counts": {
|
||||
"sostantivi": 20,
|
||||
"aggettivi": 2,
|
||||
"verbi": 1,
|
||||
"avverbi": 0,
|
||||
"preposizioni": 0,
|
||||
"congiunzioni": 0,
|
||||
"altri": 1
|
||||
},
|
||||
"generation_seconds": 124.6
|
||||
},
|
||||
"artifacts": {
|
||||
"pdf_player": null,
|
||||
"pdf_solution": null,
|
||||
"thumbnail": null,
|
||||
"html_preview": null
|
||||
}
|
||||
}
|
||||
523
crossword_filler.py
Normal file
523
crossword_filler.py
Normal file
@@ -0,0 +1,523 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import json
|
||||
from pathlib import Path
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
|
||||
|
||||
from crossword_generator import (
|
||||
DIFFXY,
|
||||
HORIZONTAL,
|
||||
VERTICAL,
|
||||
CrosswordGenerator,
|
||||
CrosswordState,
|
||||
Placement,
|
||||
WORDS,
|
||||
render_grid,
|
||||
)
|
||||
|
||||
|
||||
# Default share of the grid's bounding box that may stay empty after filling.
TARGET_EMPTY_RATIO = 1 / 6
# Vocabulary entries shorter than this are discarded.
MIN_WORD_LENGTH = 2
# fill() stops after this many consecutive iterations without placing a word.
MAX_NO_PROGRESS_STEPS = 150
# fill() only tries the first N collected slots in each iteration.
MAX_SLOT_CANDIDATES = 8
EXTENDED_VOCAB_PATH = Path(__file__).with_name("vocaboli_it_esteso.txt")
FILTERED_VOCAB_PATH = Path(__file__).with_name("vocaboli_it_filtrato.txt")
METADATA_VOCAB_PATH = Path(__file__).with_name("vocaboli_it_metadata.json")
# Resolved at import time: prefer the filtered vocabulary, then the extended
# one, and fall back to the base word list when neither file exists.
VOCAB_PATH = (
    FILTERED_VOCAB_PATH
    if FILTERED_VOCAB_PATH.exists()
    else EXTENDED_VOCAB_PATH
    if EXTENDED_VOCAB_PATH.exists()
    else Path(__file__).with_name("vocaboli_it.txt")
)

# An (x, y) position on the grid.
Coordinate = Tuple[int, int]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FillSlot:
    """Immutable description of a straight run of cells where a word may be placed."""

    x: int
    y: int
    direction: str
    length: int
    pattern: str
    fixed_letters: int
    empty_cells: int
    candidate_count: int

    @property
    def cells(self) -> List[Coordinate]:
        """Coordinates covered by the slot, starting at ``(x, y)``.

        Horizontal slots advance along x; any other direction advances along y.
        """
        step_x, step_y = (1, 0) if self.direction == HORIZONTAL else (0, 1)
        return [(self.x + step_x * index, self.y + step_y * index) for index in range(self.length)]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FillCandidate:
    """Immutable pairing of a word with the slot it would occupy."""

    word: str  # candidate word
    slot: FillSlot  # slot the word would be placed in
    new_letters: int  # presumably cells the word would newly fill — confirm against the scoring code
    reused_letters: int  # presumably cells already holding a matching letter — confirm
    local_score: Tuple[int, ...]  # ranking tuple; NOTE(review): ordering semantics are defined outside this view
|
||||
|
||||
|
||||
class CrosswordFiller:
|
||||
def __init__(
    self,
    state: CrosswordState,
    vocabulary: Sequence[str],
    *,
    target_empty_ratio: float = TARGET_EMPTY_RATIO,
    vocabulary_metadata: Optional[Dict[str, Dict[str, object]]] = None,
    semantic_metadata: Optional[Dict[str, Dict[str, object]]] = None,
    selected_topic: str = "general",
    max_themed_fill_words: int = 10,
    seed: Optional[int] = None,
) -> None:
    """Prepare a filler working on a copy of ``state``.

    Args:
        state: Starting crossword; it is copied, never mutated here.
        vocabulary: Words available for filling; normalized and indexed by length.
        target_empty_ratio: Share of the bounding box allowed to stay empty.
        vocabulary_metadata: Optional per-word metadata; defaults to an empty dict.
        semantic_metadata: Optional per-word semantic data; defaults to an empty dict.
        selected_topic: One topic or a comma-separated list; blank items are
            ignored and an empty result falls back to ``"general"``.
        max_themed_fill_words: Cap for themed fill words; negative values become 0.
        seed: Seed for the private random generator.
    """
    # Working copy plus a pristine snapshot of the starting grid.
    self.state = state.copy()
    self.initial_state = state.copy()
    self.used_words: Set[str] = {placed.word for placed in self.state.placements}
    self.added_words: List[Placement] = []

    # Vocabulary and its lookup structures.
    self.vocabulary = self._normalize_vocabulary(vocabulary)
    self.words_by_length = self._index_vocabulary(self.vocabulary)
    self.vocabulary_metadata = vocabulary_metadata or {}
    self.semantic_metadata = semantic_metadata or {}

    # Topics: the first listed topic is the primary one.
    requested_topics = [chunk.strip().lower() for chunk in selected_topic.split(",") if chunk.strip()]
    self.selected_topics = requested_topics or ["general"]
    self.selected_topic = self.selected_topics[0]
    self.max_themed_fill_words = max(0, max_themed_fill_words)

    # Random generator.
    self.seed = seed
    self.rng = random.Random(seed)

    # Geometry: the target is expressed in cells of the initial bounding box.
    self.target_empty_ratio = target_empty_ratio
    self.bounds = self._compute_bounds(self.state.grid)
    self.total_cells = self._area(self.bounds)
    self.target_empty_cells = max(0, int(round(self.total_cells * target_empty_ratio)))

    # Progress-reporting state.
    self.nodes_visited = 0
    self.started_at = 0.0
    self.last_spinner_update = 0.0
    self.spinner_frames = ["-", "/", "|", "\\"]
    self.spinner_index = 0
    self.last_word = "-"
|
||||
|
||||
@staticmethod
def _normalize_vocabulary(words: Sequence[str]) -> List[str]:
    """Strip and lower-case words, keeping the first occurrence of each valid one.

    A word is valid when it has at least ``MIN_WORD_LENGTH`` characters and
    consists only of alphabetic characters. Input order is preserved.
    """
    # A dict is used as an insertion-ordered set.
    unique: Dict[str, None] = {}
    for raw in words:
        token = raw.strip().lower()
        if len(token) >= MIN_WORD_LENGTH and token.isalpha() and token not in unique:
            unique[token] = None
    return list(unique)
|
||||
|
||||
@staticmethod
|
||||
def _index_vocabulary(words: Sequence[str]) -> Dict[int, List[str]]:
|
||||
index: Dict[int, List[str]] = {}
|
||||
for word in words:
|
||||
index.setdefault(len(word), []).append(word)
|
||||
return index
|
||||
|
||||
@staticmethod
|
||||
def _compute_bounds(grid: Dict[Coordinate, str]) -> Tuple[int, int, int, int]:
|
||||
xs = [x for x, _ in grid]
|
||||
ys = [y for _, y in grid]
|
||||
return min(xs), min(ys), max(xs), max(ys)
|
||||
|
||||
@staticmethod
|
||||
def _area(bounds: Tuple[int, int, int, int]) -> int:
|
||||
x_min, y_min, x_max, y_max = bounds
|
||||
return (x_max - x_min + 1) * (y_max - y_min + 1)
|
||||
|
||||
def fill(self) -> CrosswordState:
    """Greedily add vocabulary words until the empty-cell target is met.

    Each round collects the open slots, tries the first
    ``MAX_SLOT_CANDIDATES`` of them in order and applies the first candidate
    found. The loop ends when the target is reached, when no slot exists,
    or after ``MAX_NO_PROGRESS_STEPS`` consecutive rounds without a placement.

    Returns:
        The working state (the same object on every call).
    """
    self.started_at = time.perf_counter()
    self.last_spinner_update = self.started_at
    stalled_rounds = 0

    while stalled_rounds < MAX_NO_PROGRESS_STEPS and self.empty_cells_count() > self.target_empty_cells:
        self.nodes_visited += 1
        open_slots = self._collect_slots()
        self._tick_spinner(slots_count=len(open_slots))

        if not open_slots:
            break

        chosen = None
        for slot in open_slots[:MAX_SLOT_CANDIDATES]:
            chosen = self._best_candidate_for_slot(slot)
            if chosen is not None:
                break

        if chosen is None:
            # NOTE(review): nothing changes between stalled rounds, so the
            # next round inspects the same top slice of slots (only reshuffled).
            stalled_rounds += 1
            continue

        self._apply_candidate(chosen)
        stalled_rounds = 0

    self._clear_spinner()
    return self.state
|
||||
|
||||
def empty_cells_count(self) -> int:
|
||||
x_min, y_min, x_max, y_max = self.bounds
|
||||
empty = 0
|
||||
for y in range(y_min, y_max + 1):
|
||||
for x in range(x_min, x_max + 1):
|
||||
if (x, y) not in self.state.grid:
|
||||
empty += 1
|
||||
return empty
|
||||
|
||||
def coverage_ratio(self) -> float:
|
||||
return 1.0 - (self.empty_cells_count() / self.total_cells)
|
||||
|
||||
def _collect_slots(self) -> List[FillSlot]:
    """List the fillable slots, best priority first.

    Slots are generated from every empty cell in both directions, reduced
    to one slot per ``(x, y, direction, length)`` key and sorted by
    descending priority. The leading ``MAX_SLOT_CANDIDATES`` entries are
    then shuffled among themselves with the instance RNG.
    """
    x_min, y_min, x_max, y_max = self.bounds
    best_by_key: Dict[Tuple[int, int, str, int], FillSlot] = {}

    for row in range(y_min, y_max + 1):
        for col in range(x_min, x_max + 1):
            if (col, row) in self.state.grid:
                continue
            for direction in (HORIZONTAL, VERTICAL):
                for slot in self._slots_from_start(col, row, direction):
                    key = (slot.x, slot.y, slot.direction, slot.length)
                    incumbent = best_by_key.get(key)
                    if incumbent is None or self._slot_priority(slot) > self._slot_priority(incumbent):
                        best_by_key[key] = slot

    ordered = sorted(best_by_key.values(), key=self._slot_priority, reverse=True)
    if len(ordered) > 1:
        cutoff = min(MAX_SLOT_CANDIDATES, len(ordered))
        head, tail = ordered[:cutoff], ordered[cutoff:]
        self.rng.shuffle(head)
        ordered = head + tail
    return ordered
|
||||
|
||||
def _slots_from_start(self, x: int, y: int, direction: str) -> Iterable[FillSlot]:
    """Enumerate the slots that begin at ``(x, y)`` and run along *direction*.

    No slot is produced when the cell just before the start is occupied.
    Otherwise every length from the longest run that fits in the bounds
    down to ``MIN_WORD_LENGTH`` is considered; a length is kept when the
    cell right after it is free, at least one covered cell is empty and at
    least one unused vocabulary word matches the resulting pattern.
    """
    dx, dy = (1, 0) if direction == HORIZONTAL else (0, 1)
    x_min, y_min, x_max, y_max = self.bounds
    grid = self.state.grid

    preceding = (x - dx, y - dy)
    if self._inside_bounds(preceding) and preceding in grid:
        return []

    # Letters (or None) along the whole run from the start to the bounds edge.
    run_letters: List[Optional[str]] = []
    cursor_x, cursor_y = x, y
    while x_min <= cursor_x <= x_max and y_min <= cursor_y <= y_max:
        run_letters.append(grid.get((cursor_x, cursor_y)))
        cursor_x += dx
        cursor_y += dy

    found: List[FillSlot] = []
    for length in range(len(run_letters), MIN_WORD_LENGTH - 1, -1):
        following = (x + dx * length, y + dy * length)
        if self._inside_bounds(following) and following in grid:
            continue

        window = run_letters[:length]
        empty_cells = window.count(None)
        if empty_cells == 0:
            continue

        pattern = "".join("." if letter is None else letter for letter in window)
        candidate_count = self._count_candidates(pattern)
        if candidate_count == 0:
            continue

        found.append(
            FillSlot(
                x=x,
                y=y,
                direction=direction,
                length=length,
                pattern=pattern,
                fixed_letters=length - empty_cells,
                empty_cells=empty_cells,
                candidate_count=candidate_count,
            )
        )

    return found
|
||||
|
||||
def _slot_priority(self, slot: FillSlot) -> Tuple[int, int, int, int, int]:
    """Build the sort key of a slot; larger tuples are tried first.

    Order of preference: more fixed letters, fewer matching words, longer
    slots, fewer empty cells, horizontal before vertical.
    """
    horizontal_bonus = int(slot.direction == HORIZONTAL)
    fewer_candidates = -slot.candidate_count
    fewer_gaps = -slot.empty_cells
    return (slot.fixed_letters, fewer_candidates, slot.length, fewer_gaps, horizontal_bonus)
|
||||
|
||||
def _count_candidates(self, pattern: str) -> int:
|
||||
count = 0
|
||||
for word in self.words_by_length.get(len(pattern), []):
|
||||
if word in self.used_words:
|
||||
continue
|
||||
if self._matches_pattern(word, pattern):
|
||||
count += 1
|
||||
return count
|
||||
|
||||
@staticmethod
|
||||
def _matches_pattern(word: str, pattern: str) -> bool:
|
||||
return all(p == "." or p == w for w, p in zip(word, pattern))
|
||||
|
||||
def _best_candidate_for_slot(self, slot: FillSlot) -> Optional[FillCandidate]:
    """Pick a word for *slot*, or return ``None`` when nothing fits.

    Every unused word of the right length that matches the pattern and is
    a valid placement is scored; one of the three best-scoring candidates
    is then drawn with the instance RNG.
    """
    grid = self.state.grid
    # Identical for every word: it only depends on the slot and the grid.
    fresh_cells = sum(1 for cell in slot.cells if cell not in grid)

    viable: List[FillCandidate] = []
    for word in self.words_by_length.get(slot.length, []):
        if word in self.used_words:
            continue
        if not self._matches_pattern(word, slot.pattern):
            continue
        if not self._placement_is_valid(slot, word):
            continue
        ranking = (
            self._semantic_topic_score(word),
            slot.fixed_letters,
            fresh_cells,
            self._word_quality(word),
            self._semantic_quality(word),
            len(set(word)),
        )
        viable.append(
            FillCandidate(
                word=word,
                slot=slot,
                new_letters=fresh_cells,
                reused_letters=slot.fixed_letters,
                local_score=ranking,
            )
        )

    if not viable:
        return None

    ranked = sorted(viable, key=lambda item: item.local_score, reverse=True)
    return self.rng.choice(ranked[:3])
|
||||
|
||||
def _word_quality(self, word: str) -> int:
|
||||
metadata = self.vocabulary_metadata.get(word)
|
||||
if not metadata:
|
||||
return 0
|
||||
try:
|
||||
return int(metadata.get("quality", 0))
|
||||
except (TypeError, ValueError):
|
||||
return 0
|
||||
|
||||
def _semantic_entry(self, word: str) -> Dict[str, object]:
|
||||
return self.semantic_metadata.get(word, {})
|
||||
|
||||
def _semantic_quality(self, word: str) -> int:
|
||||
entry = self._semantic_entry(word)
|
||||
semantic = entry.get("semantic", {})
|
||||
score = 0
|
||||
if semantic.get("matched"):
|
||||
score += 2
|
||||
score += min(3, len(semantic.get("glosses", [])))
|
||||
score += min(2, len(semantic.get("synonyms", [])))
|
||||
return score
|
||||
|
||||
def _semantic_topic_score(self, word: str) -> int:
|
||||
if not self.selected_topics or self.selected_topics == ["general"]:
|
||||
return 0
|
||||
|
||||
entry = self._semantic_entry(word)
|
||||
try:
|
||||
relevance = int(entry.get("_topic_relevance", 0))
|
||||
except (TypeError, ValueError):
|
||||
relevance = 0
|
||||
if relevance:
|
||||
if self._themed_added_count() < self.max_themed_fill_words:
|
||||
return relevance
|
||||
return min(relevance, 10)
|
||||
|
||||
topics = {str(item).lower() for item in entry.get("topics", [])}
|
||||
semantic = entry.get("semantic", {})
|
||||
semantic_topics = {str(item).lower() for item in semantic.get("semantic_topics", [])}
|
||||
score = 0
|
||||
if any(topic in topics for topic in self.selected_topics):
|
||||
score += 4
|
||||
if any(topic in semantic_topics for topic in self.selected_topics):
|
||||
score += 6
|
||||
if "general" in topics:
|
||||
score += 1
|
||||
return score
|
||||
|
||||
def _themed_added_count(self) -> int:
|
||||
total = 0
|
||||
for placement in self.added_words:
|
||||
entry = self._semantic_entry(placement.word)
|
||||
try:
|
||||
if int(entry.get("_strong_topic_relevance", 0)) > 0:
|
||||
total += 1
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
return total
|
||||
|
||||
def _placement_is_valid(self, slot: FillSlot, word: str) -> bool:
    """Check that *word* can be written into *slot* on the current grid.

    The cells immediately before and after the slot must be free (cells
    outside the bounds count as free), and every occupied cell of the slot
    must already hold the matching letter. A slot with fixed letters needs
    at least one such overlap.
    """
    dx, dy = (1, 0) if slot.direction == HORIZONTAL else (0, 1)
    grid = self.state.grid

    end_caps = (
        (slot.x - dx, slot.y - dy),
        (slot.x + dx * slot.length, slot.y + dy * slot.length),
    )
    for cap in end_caps:
        if self._inside_bounds(cap) and cap in grid:
            return False

    touches_existing = False
    for position, cell in enumerate(slot.cells):
        existing = grid.get(cell)
        letter = word[position]
        if existing is None:
            continue
        if existing != letter:
            return False
        touches_existing = True

    return touches_existing or slot.fixed_letters == 0
|
||||
|
||||
def _apply_candidate(self, candidate: FillCandidate) -> None:
    """Write the candidate word onto the grid and record the placement.

    Updates the grid, the placement list, the intersection total, the set
    of used words and the list of added words, then logs the insertion on
    standard error.
    """
    slot = candidate.slot
    dx, dy = (1, 0) if slot.direction == HORIZONTAL else (0, 1)
    grid = self.state.grid

    # The cells lie on a straight line, so they are all distinct and the
    # overlaps can be counted before any letter is written.
    cells = [
        (slot.x + dx * offset, slot.y + dy * offset)
        for offset in range(len(candidate.word))
    ]
    intersections = sum(1 for cell in cells if cell in grid)
    for cell, letter in zip(cells, candidate.word):
        grid[cell] = letter

    placement = Placement(
        word=candidate.word,
        x=slot.x,
        y=slot.y,
        direction=slot.direction,
        intersections=intersections,
    )
    self.state.placements.append(placement)
    self.state.intersections += intersections
    self.used_words.add(candidate.word)
    self.added_words.append(placement)
    self.last_word = candidate.word
    print(
        f"\n[fill] inserita '{candidate.word}' "
        f"in {slot.direction} da ({slot.x}, {slot.y}), "
        f"nuove={candidate.new_letters}, intersezioni={intersections}, "
        f"copertura={self.coverage_ratio() * 100:.1f}%",
        file=sys.stderr,
        flush=True,
    )
|
||||
|
||||
def _inside_bounds(self, cell: Coordinate) -> bool:
|
||||
x_min, y_min, x_max, y_max = self.bounds
|
||||
x, y = cell
|
||||
return x_min <= x <= x_max and y_min <= y <= y_max
|
||||
|
||||
def _tick_spinner(self, *, slots_count: int) -> None:
|
||||
now = time.perf_counter()
|
||||
if now - self.last_spinner_update < 0.08:
|
||||
return
|
||||
|
||||
frame = self.spinner_frames[self.spinner_index]
|
||||
elapsed = now - self.started_at
|
||||
message = (
|
||||
f"\r{frame} fill... "
|
||||
f"slot={slots_count} "
|
||||
f"vuote={self.empty_cells_count()}/{self.total_cells} "
|
||||
f"target={self.target_empty_cells} "
|
||||
f"aggiunte={len(self.added_words)} "
|
||||
f"tema={self._themed_added_count()}/{self.max_themed_fill_words} "
|
||||
f"ultima={self.last_word} "
|
||||
f"t={elapsed:0.1f}s"
|
||||
)
|
||||
print(message, end="", file=sys.stderr, flush=True)
|
||||
self.spinner_index = (self.spinner_index + 1) % len(self.spinner_frames)
|
||||
self.last_spinner_update = now
|
||||
|
||||
@staticmethod
def _clear_spinner() -> None:
    """Blank out the progress line on standard error."""
    blank_line = " " * 140
    print("\r" + blank_line + "\r", end="", file=sys.stderr, flush=True)
|
||||
|
||||
|
||||
def load_vocabulary(path: Path = VOCAB_PATH) -> List[str]:
    """Read the vocabulary file and return its lines.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if path.exists():
        content = path.read_text(encoding="utf-8")
        return content.splitlines()
    raise FileNotFoundError(f"Vocabolario non trovato: {path}")
|
||||
|
||||
|
||||
def load_vocabulary_metadata(path: Path = METADATA_VOCAB_PATH) -> Dict[str, Dict[str, object]]:
    """Load the vocabulary metadata JSON, or return ``{}`` if the file is absent."""
    if path.exists():
        raw = path.read_text(encoding="utf-8")
        return json.loads(raw)
    return {}
|
||||
|
||||
|
||||
def summarize_fill(initial_state: CrosswordState, final_state: CrosswordState) -> str:
    """Build the multi-line report printed after a fill run.

    Args:
        initial_state: State before the filler ran.
        final_state: State returned by the filler.

    Returns:
        A text block with the word counts, the final intersection total,
        the grid dimensions before and after, and the number of cells that
        are still empty in the final state.
    """
    # (width, height, shape difference) of the starting grid.
    initial_bounds = (
        initial_state.width(),
        initial_state.height(),
        initial_state.shape_difference(),
    )
    # Same triple for the filled grid.
    final_bounds = (
        final_state.width(),
        final_state.height(),
        final_state.shape_difference(),
    )
    return (
        f"Riempimento completato\n"
        f"Parole iniziali: {initial_state.placed_words}\n"
        f"Parole finali: {final_state.placed_words}\n"
        f"Intersezioni finali: {final_state.intersections}\n"
        f"Dimensioni iniziali: {initial_bounds[0]}x{initial_bounds[1]} (diff={initial_bounds[2]})\n"
        f"Dimensioni finali: {final_bounds[0]}x{final_bounds[1]} (diff={final_bounds[2]})\n"
        f"Celle vuote residue: {sum(1 for _ in iter_empty_cells(final_state))}"
    )
|
||||
|
||||
|
||||
def iter_empty_cells(state: CrosswordState) -> Iterable[Coordinate]:
    """Yield the empty cells of the state's bounding box, row by row."""
    x_min, y_min, x_max, y_max = state.bounds()
    columns = range(x_min, x_max + 1)
    for row in range(y_min, y_max + 1):
        yield from ((col, row) for col in columns if (col, row) not in state.grid)
|
||||
|
||||
|
||||
def main() -> None:
    """Generate a crossword from the built-in words, fill it and print the result."""
    words = load_vocabulary()
    metadata = load_vocabulary_metadata()

    initial_state = CrosswordGenerator(WORDS, diffxy=DIFFXY).solve()

    filler = CrosswordFiller(initial_state, words, vocabulary_metadata=metadata)
    final_state = filler.fill()

    print(summarize_fill(initial_state, final_state))
    print()
    print(render_grid(final_state.grid, final_state.placements))

    if not filler.added_words:
        return

    print()
    print("Parole aggiunte dal filler:")
    for index, placement in enumerate(filler.added_words, start=1):
        if placement.direction == HORIZONTAL:
            direction = "orizzontale"
        else:
            direction = "verticale"
        print(f"{index:>2}. {placement.word} ({placement.x}, {placement.y}) {direction}")


if __name__ == "__main__":
    main()
|
||||
@@ -1,6 +1,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
import locale
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Set, Tuple
|
||||
@@ -36,6 +38,8 @@ WORDS = [
|
||||
HORIZONTAL = "H"
|
||||
VERTICAL = "V"
|
||||
DIFFXY = 7
|
||||
EMPTY_CELL_RENDER = "□"
|
||||
EMPTY_CELL_FALLBACK = "#"
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -111,6 +115,7 @@ class CrosswordGenerator:
|
||||
max_candidates_per_word: int = 12,
|
||||
time_limit_seconds: float = 8.0,
|
||||
diffxy: int = DIFFXY,
|
||||
seed: Optional[int] = None,
|
||||
) -> None:
|
||||
normalized = [self._normalize(word) for word in words]
|
||||
unique_words = list(dict.fromkeys(word for word in normalized if len(word) >= 2))
|
||||
@@ -119,6 +124,8 @@ class CrosswordGenerator:
|
||||
self.max_candidates_per_word = max_candidates_per_word
|
||||
self.time_limit_seconds = time_limit_seconds
|
||||
self.diffxy = diffxy
|
||||
self.seed = seed
|
||||
self.rng = random.Random(seed)
|
||||
self.started_at = 0.0
|
||||
self.visited: Dict[Tuple[frozenset, Tuple[str, ...]], Tuple[int, int, int]] = {}
|
||||
self.nodes_visited = 0
|
||||
@@ -210,6 +217,8 @@ class CrosswordGenerator:
|
||||
reverse=True,
|
||||
)
|
||||
candidates = candidates[: self.max_candidates_per_word]
|
||||
if len(candidates) > 1:
|
||||
self.rng.shuffle(candidates)
|
||||
|
||||
next_remaining = [word for word in remaining_words if word != next_word]
|
||||
for placement in candidates:
|
||||
@@ -250,6 +259,10 @@ class CrosswordGenerator:
|
||||
word,
|
||||
),
|
||||
)
|
||||
if len(ranked_words) > 1:
|
||||
top_slice = ranked_words[: min(5, len(ranked_words))]
|
||||
self.rng.shuffle(top_slice)
|
||||
ranked_words = top_slice + ranked_words[min(5, len(ranked_words)) :]
|
||||
|
||||
best_word = ranked_words[0]
|
||||
best_key: Optional[Tuple[int, int, int, str]] = None
|
||||
@@ -361,6 +374,9 @@ def render_grid(grid: Grid, placements: Sequence[Placement]) -> str:
|
||||
if not grid:
|
||||
return "(griglia vuota)"
|
||||
|
||||
encoding = (getattr(sys.stdout, "encoding", None) or locale.getpreferredencoding(False) or "").lower()
|
||||
empty_cell = EMPTY_CELL_RENDER if "utf" in encoding else EMPTY_CELL_FALLBACK
|
||||
|
||||
x_min = min(x for x, _ in grid)
|
||||
x_max = max(x for x, _ in grid)
|
||||
y_min = min(y for _, y in grid)
|
||||
@@ -373,7 +389,7 @@ def render_grid(grid: Grid, placements: Sequence[Placement]) -> str:
|
||||
for y in range(y_min, y_max + 1):
|
||||
row = [f"{y:>3} "]
|
||||
for x in range(x_min, x_max + 1):
|
||||
row.append(grid.get((x, y), ".").upper().rjust(2))
|
||||
row.append(grid.get((x, y), empty_cell).upper().rjust(2))
|
||||
lines.append(" ".join(row))
|
||||
|
||||
lines.append("")
|
||||
|
||||
611
curate_lexicon_alpha.py
Normal file
611
curate_lexicon_alpha.py
Normal file
@@ -0,0 +1,611 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default output files, created next to this script.
CURATED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_curated.json")
TO_BE_REVIEW_OUTPUT_PATH = Path(__file__).with_name("to_be_review.json")

# Difficulty labels, from easiest to hardest.
DIFFICULTIES = ("easy", "medium", "hard", "expert")

# Literal text repairs applied to definition text (typos, truncated accents,
# leftover boilerplate).
# NOTE(review): "quantit" is a prefix of its own replacement "quantità" and of
# the longer keys above it; a plain substring replace with this key also
# rewrites text that is already correct, so the consumer must guard against it.
TEXT_REPLACEMENTS = {
    " ngrandimento": " ingrandimento",
    "superificie": "superficie",
    "quantitaaa": "quantità",
    "quantitaaaa": "quantità",
    "quantit": "quantità",
    "sanit_militare": "sanità_militare",
    " unaparola ": " una parola ",
    "questa parola, ": "",
    "questa parola; ": "",
}

# Regexes hinting that a definition describes a proper noun (a work, a place,
# a person). They are searched in accent-folded, lowercased text, which is why
# the patterns themselves carry no accents.
SUSPICIOUS_PROPER_PATTERNS = (
    r"\bepisodio\b",
    r"\bfilm\b",
    r"\bserie tv\b",
    r"\bfamiglia\b",
    r"\bcomune italiano\b",
    r"\bfrazione del comune\b",
    r"\bcitta metropolitana\b",
    r"\bpersonaggio\b",
    r"\balbum\b",
    r"\bcognome\b",
    r"\bnome proprio\b",
)

# Per-topic substrings; a definition containing one of them is considered
# aligned with that topic. Several entries are word stems on purpose.
DOMAIN_HINTS = {
    "religion": ("monastero", "abbazia", "sacerdot", "prete", "vescovo", "clero", "religios"),
    "transport": ("veicolo", "motore", "aereo", "treno", "nave", "trasport", "rimorch", "reattor"),
    "health": ("malat", "ferit", "ospedal", "medic", "sanitar", "cura", "paziente"),
    "nature": ("animale", "pianta", "mare", "bosco", "albero", "fiore", "montagna", "acque", "salate"),
    "geography": ("comune", "paese", "regione", "provincia", "isola", "citta", "territorio"),
    "sea": ("acque", "salate", "superficie terrestre", "oceano"),
}

# Regexes marking figurative or vague definitions; also searched in
# accent-folded, lowercased text.
ABSTRACT_PATTERNS = (
    r"\bgrande quantita\b",
    r"\bfigurato\b",
    r"\bsenso figurato\b",
)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the curation script."""
    parser = argparse.ArgumentParser(
        description="Cura il lessico refined per la milestone alpha e separa i casi dubbi in to_be_review.json."
    )
    # The three path options share the same shape: flag, default, help text.
    path_options = (
        ("--input", REFINED_LEXICON_OUTPUT_PATH, "Lessico refined di partenza."),
        ("--output", CURATED_LEXICON_OUTPUT_PATH, "Lessico curated da generare."),
        (
            "--review-output",
            TO_BE_REVIEW_OUTPUT_PATH,
            "File JSON con le voci che richiedono revisione umana.",
        ),
    )
    for flag, default_path, help_text in path_options:
        parser.add_argument(flag, type=Path, default=default_path, help=help_text)
    parser.add_argument(
        "--max-review",
        type=int,
        default=0,
        help="Limite opzionale di voci da esportare in to_be_review.json. 0 = tutte.",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> Dict[str, object]:
    """Parse the UTF-8 JSON document stored at *path*."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Write *payload* to *path* as indented UTF-8 JSON, keeping non-ASCII text."""
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
|
||||
|
||||
|
||||
def dedupe(items: Iterable[str]) -> List[str]:
    """Return the stripped, non-empty items without case-insensitive repeats.

    The first spelling of each item is kept, in first-seen order.
    """
    first_spelling: Dict[str, str] = {}
    for raw in items:
        token = str(raw).strip()
        if token:
            first_spelling.setdefault(token.lower(), token)
    return list(first_spelling.values())
|
||||
|
||||
|
||||
# Built once: accented vowels (grave and acute, both cases) -> plain letter.
_ASCII_FOLD_TABLE = str.maketrans(
    "àáèéìíòóùúÀÁÈÉÌÍÒÓÙÚ",
    "aaeeiioouuAAEEIIOOUU",
)


def ascii_fold(text: str) -> str:
    """Replace accented vowels with their unaccented letter.

    Uppercase accents are folded too. Callers lowercase the result and
    match it against ASCII-only patterns, so leaving "À" untouched would
    turn "CITTÀ" into "città" and make those patterns miss it.

    Args:
        text: Any value; it is converted with ``str`` first.

    Returns:
        The text with grave/acute accented vowels folded, case preserved.
    """
    return str(text).translate(_ASCII_FOLD_TABLE)
|
||||
|
||||
|
||||
def normalize_text(text: str) -> str:
    """Clean a definition fragment into a single capitalised sentence.

    Steps: apply ``TEXT_REPLACEMENTS``, collapse whitespace, normalise the
    spacing around ";" and ",", trim surrounding punctuation, capitalise the
    first letter and end with a full stop.

    The replacements run in one pass, longest key first, and a key that
    ends with a word character only matches when no word character
    follows. Chained ``str.replace`` calls let the key "quantit" match
    inside the already-correct "quantità" (giving "quantitàà") and inside
    words such as "quantitativo".

    Args:
        text: Raw fragment; ``None`` and blank strings are accepted.

    Returns:
        The cleaned sentence, or ``""`` when nothing but whitespace and
        punctuation is left (previously a lone "." was returned).
    """
    value = str(text or "").strip()
    if not value:
        return ""

    keys = sorted((key for key in TEXT_REPLACEMENTS if key), key=len, reverse=True)
    if keys:
        alternatives = []
        for key in keys:
            last = key[-1]
            # Guard only keys ending in a word character, so a key cannot
            # match a prefix of a longer word.
            guard = r"(?!\w)" if last.isalnum() or last == "_" else ""
            alternatives.append(re.escape(key) + guard)
        value = re.sub(
            "|".join(alternatives),
            lambda match: TEXT_REPLACEMENTS[match.group(0)],
            value,
        )

    value = re.sub(r"\s+", " ", value)
    value = re.sub(r"\s*;\s*", "; ", value)
    value = re.sub(r"\s*,\s*", ", ", value)
    value = value.strip(" .;:-")
    if not value:
        return ""
    if value[0].islower():
        value = value[0].upper() + value[1:]
    return value + "."
|
||||
|
||||
|
||||
def split_definition_text(text: str) -> List[str]:
    """Split a definition into normalised fragments.

    The text is cut at semicolons followed by whitespace and at full stops
    followed by whitespace and a letter; fragments that normalise to an
    empty string are dropped.
    """
    value = str(text or "").strip()
    if not value:
        return []
    fragments = re.split(r"\s*;\s+|\.\s+(?=[a-zàèéìòù])", value, flags=re.IGNORECASE)
    cleaned_fragments = (normalize_text(fragment) for fragment in fragments)
    return [fragment for fragment in cleaned_fragments if fragment]
|
||||
|
||||
|
||||
def entry_is_common_word(entry: Dict[str, object]) -> bool:
    """Tell whether the entry is a lowercase form without name tags."""
    form = str(entry.get("form", ""))
    if not form or not form[:1].islower():
        return False
    return not (entry.get("name_tags") or [])
|
||||
|
||||
|
||||
def definition_mentions_answer(text: str, answer: str) -> bool:
    """Tell whether *answer* occurs inside *text*.

    Both strings are accent-folded and lowercased first, and the answer is
    searched as a plain substring.

    An empty answer never counts as mentioned. An empty needle matches
    every text, so entries without a form used to be reported as leaking
    their answer.
    """
    normalized_answer = ascii_fold(answer).lower()
    if not normalized_answer:
        return False
    normalized_text = ascii_fold(text).lower()
    return normalized_answer in normalized_text
|
||||
|
||||
|
||||
def suspicious_proper_noun_definition(text: str, entry: Dict[str, object]) -> bool:
    """Tell whether a common word's definition looks like a proper-noun gloss.

    Entries that are not common words are never flagged.
    """
    if entry_is_common_word(entry):
        folded = ascii_fold(text).lower()
        for pattern in SUSPICIOUS_PROPER_PATTERNS:
            if re.search(pattern, folded):
                return True
    return False
|
||||
|
||||
|
||||
def likely_abstract_detour(text: str) -> bool:
    """Tell whether the accent-folded, lowercased text hits an abstract pattern."""
    folded = ascii_fold(text).lower()
    for pattern in ABSTRACT_PATTERNS:
        if re.search(pattern, folded):
            return True
    return False
|
||||
|
||||
|
||||
def semantic_topics(entry: Dict[str, object]) -> List[str]:
    """Collect the lowercased topics from the semantic and wiktextract sections.

    Sections that are not dicts are skipped; the combined list is
    de-duplicated, semantic topics first.
    """
    gathered: List[str] = []
    sections = (("semantic", "semantic_topics"), ("wiktextract", "topic_hints"))
    for section_name, field_name in sections:
        section = entry.get(section_name, {})
        if isinstance(section, dict):
            gathered += [str(item).lower() for item in section.get(field_name, []) or []]
    return dedupe(gathered)
|
||||
|
||||
|
||||
def lexical_topics(entry: Dict[str, object]) -> List[str]:
    """Return the entry's truthy "topics" values, lowercased."""
    lowered: List[str] = []
    for item in entry.get("topics", []) or []:
        if item:
            lowered.append(str(item).lower())
    return lowered
|
||||
|
||||
|
||||
def topic_alignment_score(text: str, entry: Dict[str, object]) -> int:
    """Score 16 points for each topic hint found in the text.

    The hints of every lexical or semantic topic of the entry are searched
    as substrings of the accent-folded, lowercased text; a hint listed
    under two of the entry's topics counts twice.
    """
    folded = ascii_fold(text).lower()
    topics = set(lexical_topics(entry)) | set(semantic_topics(entry))
    hits = sum(
        1
        for topic in topics
        for hint in DOMAIN_HINTS.get(topic, ())
        if hint in folded
    )
    return hits * 16
|
||||
|
||||
|
||||
def candidate_style(text: str) -> str:
    """Classify a definition as "direct", "balanced" or "oblique".

    Text with a semicolon or longer than 90 characters is "direct"; text
    containing one of the descriptive markers is "balanced"; anything else
    is "oblique".
    """
    if ";" in text or len(text) > 90:
        return "direct"
    folded = ascii_fold(text).lower()
    markers = ("chi ", "che ", "strumento", "veicolo", "titolo", "parte di")
    for marker in markers:
        if marker in folded:
            return "balanced"
    return "oblique"
|
||||
|
||||
|
||||
def length_window(difficulty: str) -> Tuple[int, int]:
    """Return the preferred (min, max) definition length for a difficulty.

    Unknown labels share the "expert" window.
    """
    windows = (
        ("easy", (18, 90)),
        ("medium", (18, 78)),
        ("hard", (14, 62)),
    )
    for label, window in windows:
        if difficulty == label:
            return window
    return 12, 55
|
||||
|
||||
|
||||
def build_candidate(
    text: str,
    *,
    source: str,
    family: str,
    confidence: float,
    priority: int = 0,
) -> Dict[str, object]:
    """Assemble a candidate-definition record from normalised *text*.

    The record carries the cleaned text, its provenance (source and family),
    the confidence and priority given by the caller and the style derived
    from the cleaned text.
    """
    cleaned = normalize_text(text)
    style = candidate_style(cleaned)
    record: Dict[str, object] = {
        "text": cleaned,
        "source": source,
        "family": family,
        "confidence": confidence,
        "style": style,
        "priority": priority,
    }
    return record
|
||||
|
||||
|
||||
def collect_candidates(entry: Dict[str, object]) -> List[Dict[str, object]]:
    """Gather every candidate definition available for an entry.

    Sources are visited in a fixed order: semantic synset definitions,
    semantic glosses, refined senses, BabelNet glosses of the best synset
    and wiktextract definitions. Each text is split into fragments; a
    fragment is kept once per (lowercased text, family) pair. Priorities
    decrease with the position of the item inside its source.
    """
    collected: List[Dict[str, object]] = []
    seen_keys = set()

    def offer(piece: str, *, source: str, family: str, confidence: float, priority: int) -> None:
        """Build a candidate and keep it unless empty or already seen."""
        candidate = build_candidate(
            piece,
            source=source,
            family=family,
            confidence=confidence,
            priority=priority,
        )
        key = (candidate["text"].lower(), candidate["family"])
        if candidate["text"] and key not in seen_keys:
            seen_keys.add(key)
            collected.append(candidate)

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        for index, synset in enumerate(semantic.get("synsets", []) or []):
            if not isinstance(synset, dict):
                continue
            for piece in split_definition_text(str(synset.get("definition", ""))):
                offer(
                    piece,
                    source="semantic",
                    family="semantic_definition",
                    confidence=0.9,
                    priority=max(0, 100 - index * 12),
                )
        for index, gloss in enumerate(semantic.get("glosses", []) or []):
            for piece in split_definition_text(str(gloss)):
                offer(
                    piece,
                    source="semantic_gloss",
                    family="semantic_gloss",
                    confidence=0.8,
                    priority=max(0, 90 - index * 10),
                )

    for index, sense in enumerate(entry.get("senses", []) or []):
        if not isinstance(sense, dict):
            continue
        for piece in split_definition_text(str(sense.get("definition", ""))):
            # Evaluated per fragment so a sense without text is never inspected.
            source = str(sense.get("source", "refined"))
            offer(
                piece,
                source="refined" if source == "semantic" else source,
                family="refined_sense",
                confidence=float(sense.get("confidence", 0.7) or 0.7),
                priority=max(0, 80 - index * 8),
            )

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        best_synset = babelnet.get("best_synset", {})
        if isinstance(best_synset, dict):
            babelnet_confidence = 0.85 if babelnet.get("status") == "enriched" else 0.55
            for index, gloss in enumerate(best_synset.get("glosses", []) or []):
                for piece in split_definition_text(str(gloss)):
                    offer(
                        piece,
                        source="babelnet",
                        family="babelnet_gloss",
                        confidence=babelnet_confidence,
                        priority=max(0, 60 - index * 8),
                    )

    wiktextract = entry.get("wiktextract", {})
    if isinstance(wiktextract, dict):
        definitions = wiktextract.get("definitions", []) or []
        wiktextract_confidence = 0.78 if wiktextract.get("matched") else 0.45
        for index, definition in enumerate(definitions):
            for piece in split_definition_text(str(definition)):
                offer(
                    piece,
                    source="wiktextract",
                    family="wiktextract_definition",
                    confidence=wiktextract_confidence,
                    priority=max(0, 88 - index * 9),
                )

    return collected
|
||||
|
||||
|
||||
def score_candidate(candidate: Dict[str, object], entry: Dict[str, object], difficulty: str) -> int:
    """Rate a candidate definition for an entry at a given difficulty.

    Texts shorter than 12 characters get -10000 outright. Otherwise the
    score is the sum of independent adjustments: answer leakage, proper-noun
    and abstract-wording penalties, fit with the length window, source and
    family bonuses, priority, confidence, topic alignment, style fit,
    register markers, semicolons and the entry's review flag.
    """
    text = str(candidate["text"])
    answer = str(entry.get("form", "")).lower()
    source = str(candidate.get("source"))
    family = str(candidate.get("family"))
    confidence = float(candidate.get("confidence", 0.0) or 0.0)

    text_length = len(text)
    if text_length < 12:
        return -10_000

    adjustments: List[int] = []

    # Content penalties.
    adjustments.append(-140 if definition_mentions_answer(text, answer) else 30)
    if suspicious_proper_noun_definition(text, entry):
        adjustments.append(-220)
    if likely_abstract_detour(text):
        adjustments.append(-80)

    # Length window: full overshoot is subtracted, undershoot only by half.
    shortest, longest = length_window(difficulty)
    if text_length > longest:
        adjustments.append(-abs(text_length - longest))
    elif text_length < shortest:
        adjustments.append(-(abs(shortest - text_length) // 2))
    else:
        adjustments.append(24)

    # Provenance.
    source_bonus = {
        "semantic": 55,
        "semantic_gloss": 40,
        "babelnet": 24,
        "refined": 30,
        "wiktextract": 52,
    }
    family_bonus = {
        "semantic_definition": 30,
        "semantic_gloss": 18,
        "babelnet_gloss": 8,
        "refined_sense": 22,
        "wiktextract_definition": 28,
    }
    adjustments.append(source_bonus.get(source, 10))
    adjustments.append(family_bonus.get(family, 0))
    adjustments.append(int(candidate.get("priority", 0) or 0))
    adjustments.append(int(confidence * 35))

    # Topic alignment; concrete topics with no matching hint are penalised.
    alignment = topic_alignment_score(text, entry)
    adjustments.append(alignment)
    topical = set(lexical_topics(entry)) | set(semantic_topics(entry))
    concrete_topics = topical.intersection({"religion", "transport", "health", "nature", "geography", "sea"})
    if concrete_topics and alignment == 0:
        adjustments.append(-45)

    # Style fit per difficulty.
    style = str(candidate.get("style"))
    style_bonus = {
        ("easy", "direct"): 16,
        ("medium", "direct"): 14,
        ("medium", "balanced"): 14,
        ("hard", "balanced"): 10,
        ("expert", "oblique"): 10,
    }
    adjustments.append(style_bonus.get((difficulty, style), 0))

    if difficulty in {"easy", "medium"} and re.search(r"\((?:mil|fig|lett|fam)\.\)", text.lower()):
        adjustments.append(-30)
    if difficulty in {"hard", "expert"} and ";" in text:
        adjustments.append(-10)

    if entry.get("needs_review"):
        adjustments.append(-8)

    return sum(adjustments)
|
||||
|
||||
|
||||
def choose_best_candidate(
    candidates: Sequence[Dict[str, object]],
    entry: Dict[str, object],
    difficulty: str,
) -> Optional[Dict[str, object]]:
    """Return the highest-ranked candidate, or ``None`` when there is none.

    Candidates are ranked by score, then confidence, then priority, then
    shorter text; among full ties the earliest candidate wins.
    """

    def ranking(candidate: Dict[str, object]) -> Tuple[int, float, float, int]:
        return (
            score_candidate(candidate, entry, difficulty),
            float(candidate.get("confidence", 0.0)),
            float(candidate.get("priority", 0.0)),
            -len(str(candidate.get("text", ""))),
        )

    # max() keeps the first of equal maxima, like a stable descending sort.
    return max(candidates, key=ranking, default=None)
|
||||
|
||||
|
||||
def review_reasons(entry: Dict[str, object], candidates: Sequence[Dict[str, object]]) -> List[str]:
    """Collect the reason codes explaining why an entry needs manual review.

    Args:
        entry: Curated lexicon entry; ``preferred_definition`` and
            ``preferred_source`` are read from it.
        candidates: Definition candidates collected for the entry.

    Returns:
        The reason codes in the order the checks run, passed through
        ``dedupe``. The list is empty when no check fires.
    """
    reasons: List[str] = []
    form = str(entry.get("form", ""))
    lowered_topics = set(lexical_topics(entry))
    semantic_topic_set = set(semantic_topics(entry))
    babelnet_status = str((entry.get("babelnet") or {}).get("status", ""))
    wiktextract = entry.get("wiktextract", {})
    wiktextract_status = str(wiktextract.get("status", "")) if isinstance(wiktextract, dict) else ""
    preferred_definition = str(entry.get("preferred_definition", ""))
    preferred_source = str(entry.get("preferred_source", ""))

    if not candidates:
        reasons.append("no_viable_definition")
    if not preferred_definition and entry.get("needs_review"):
        reasons.append("flagged_by_refined_stage")
    if preferred_definition and suspicious_proper_noun_definition(preferred_definition, entry):
        reasons.append("proper_noun_collision")
    if babelnet_status == "ambiguous" and preferred_source == "babelnet":
        reasons.append("babelnet_ambiguous")
    if wiktextract_status in {"missing", "no_match"} and not preferred_definition:
        reasons.append("wiktextract_missing")
    if lowered_topics == {"general"} and not semantic_topic_set and not preferred_definition:
        reasons.append("only_general_topics")
    if len(form) <= 2:
        reasons.append("very_short_word")
    if str(entry.get("pos", "")) in {"PREP", "CONJ"}:
        reasons.append("function_word")
    if preferred_source == "babelnet":
        # "senses" may be present but null in the JSON; `or []` keeps the
        # iteration from raising TypeError in that case.
        senses = entry.get("senses") or []
        # A sense whose topics are missing, or contain None, renders the
        # substring "None" once stringified.
        if any("None" in str(sense.get("topics")) for sense in senses if isinstance(sense, dict)):
            reasons.append("unresolved_sense_topics")
    if preferred_definition and definition_mentions_answer(preferred_definition, form.lower()):
        reasons.append("candidate_mentions_answer")

    return dedupe(reasons)
|
||||
|
||||
|
||||
def curate_entry(entry: Dict[str, object]) -> Tuple[Dict[str, object], Optional[Dict[str, object]]]:
    """Build the curated version of one lexicon entry.

    Works on a deep copy of ``entry``. For every difficulty in
    ``DIFFICULTIES`` the best candidate is stored; the "medium" clue (falling
    back to "easy") becomes the preferred definition.

    Args:
        entry: Source lexicon entry; it is not mutated.

    Returns:
        ``(curated, review_item)``. ``review_item`` is ``None`` when
        ``review_reasons`` reports nothing, otherwise a summary dictionary
        that includes at most the first 12 candidates.
    """

    def describe(candidate: Dict[str, object], text_key: str) -> Dict[str, object]:
        # Same projection for curated senses and the review pool; only the
        # name of the text field differs.
        return {
            text_key: candidate["text"],
            "source": candidate["source"],
            "family": candidate["family"],
            "confidence": candidate["confidence"],
            "priority": candidate["priority"],
        }

    curated = deepcopy(entry)
    candidates = collect_candidates(curated)

    clue_definitions: Dict[str, str] = {}
    clue_sources: Dict[str, str] = {}
    clue_scores: Dict[str, int] = {}
    for difficulty in DIFFICULTIES:
        winner = choose_best_candidate(candidates, curated, difficulty)
        if not winner:
            continue
        clue_definitions[difficulty] = str(winner["text"])
        clue_sources[difficulty] = str(winner["source"])
        clue_scores[difficulty] = score_candidate(winner, curated, difficulty)

    preferred_definition = clue_definitions.get("medium") or clue_definitions.get("easy") or ""
    preferred_source = clue_sources.get("medium") or clue_sources.get("easy") or "fallback"

    curation_notes: List[str] = []
    if preferred_definition:
        curation_notes.append(f"preferred_from={preferred_source}")
    # A missing medium score counts as weak (sentinel -9999).
    if clue_scores.get("medium", -9999) < 20:
        curation_notes.append("weak_medium_definition")

    curated.update(
        {
            "curated_glosses": dedupe(candidate["text"] for candidate in candidates),
            "curated_senses": [describe(candidate, "definition") for candidate in candidates],
            "preferred_definition": preferred_definition,
            "preferred_source": preferred_source,
            "clue_definitions": clue_definitions,
            "clue_sources": clue_sources,
            "clue_scores": clue_scores,
            "curation_notes": curation_notes,
        }
    )

    # review_reasons reads the preferred_* fields written just above.
    reasons = review_reasons(curated, candidates)
    severe = {"no_viable_definition", "proper_noun_collision", "candidate_mentions_answer"}
    curated["alpha_ready"] = bool(preferred_definition) and not severe.intersection(reasons)
    curated["review_reasons"] = reasons

    if not reasons:
        return curated, None

    review_item = {
        "form": curated.get("form"),
        "lemma": curated.get("lemma"),
        "pos": curated.get("pos"),
        "topics": curated.get("topics"),
        "topic_suggestions": curated.get("topic_suggestions"),
        "preferred_definition": preferred_definition,
        "preferred_source": preferred_source,
        "clue_definitions": clue_definitions,
        "review_reasons": reasons,
        "semantic_glosses": (curated.get("semantic") or {}).get("glosses", []),
        "senses": curated.get("senses", []),
        "babelnet_status": (curated.get("babelnet") or {}).get("status"),
        "babelnet_best_synset": (curated.get("babelnet") or {}).get("best_synset"),
        "wiktextract_status": (curated.get("wiktextract") or {}).get("status"),
        "wiktextract": curated.get("wiktextract"),
        "candidate_pool": [describe(candidate, "text") for candidate in candidates[:12]],
    }
    return curated, review_item
|
||||
|
||||
|
||||
def build_curated_lexicon(args: argparse.Namespace) -> Tuple[Dict[str, object], Dict[str, object]]:
    """Curate every entry of the input lexicon and assemble both output payloads.

    Args:
        args: Parsed CLI options; ``args.input`` and ``args.max_review`` are read.

    Returns:
        ``(curated_payload, review_payload)``, each a dictionary with a
        ``meta`` block and an ``entries`` list.

    Raises:
        ValueError: If the loaded input is not a dictionary with an
            ``entries`` key.
    """
    payload = load_json(args.input)
    if not (isinstance(payload, dict) and "entries" in payload):
        raise ValueError(f"Lessico refined non valido: {args.input}")

    def timestamp() -> str:
        return datetime.now().astimezone().isoformat(timespec="seconds")

    curated_entries: List[Dict[str, object]] = []
    review_entries: List[Dict[str, object]] = []
    for raw_entry in payload.get("entries", []) or []:
        if isinstance(raw_entry, dict):
            curated, review_item = curate_entry(raw_entry)
            curated_entries.append(curated)
            if review_item:
                review_entries.append(review_item)

    # The cap is applied before the counts below are taken, so the reported
    # review counts reflect the truncated list.
    if args.max_review > 0:
        review_entries = review_entries[: args.max_review]

    curated_payload = {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": args.input.name,
            "generated_at": timestamp(),
            "entry_count": len(curated_entries),
            "alpha_ready_count": sum(1 for item in curated_entries if item.get("alpha_ready")),
            "review_count": len(review_entries),
        },
        "entries": curated_entries,
    }
    review_payload = {
        "meta": {
            "language": "it",
            "version": 1,
            "base_lexicon": args.input.name,
            "generated_at": timestamp(),
            "entry_count": len(review_entries),
        },
        "entries": review_entries,
    }
    return curated_payload, review_payload
|
||||
|
||||
|
||||
def main() -> None:
    """Run the curation from the command line and print a short summary."""
    args = parse_args()
    curated_payload, review_payload = build_curated_lexicon(args)
    write_json(args.output, curated_payload)
    write_json(args.review_output, review_payload)

    curated_meta = curated_payload["meta"]
    summary = (
        f"Lessico curated generato: {args.output}",
        f"Voci totali: {curated_meta['entry_count']}",
        f"Voci alpha_ready: {curated_meta['alpha_ready_count']}",
        f"Voci da revisionare: {review_payload['meta']['entry_count']}",
        f"File review generato: {args.review_output}",
    )
    for line in summary:
        print(line)


if __name__ == "__main__":
    main()
|
||||
492
enrich_review_from_wiktextract_file.py
Normal file
492
enrich_review_from_wiktextract_file.py
Normal file
@@ -0,0 +1,492 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default locations, all resolved next to this script.
REVIEW_INPUT_PATH = Path(__file__).with_name("to_be_review.json")
WIKTEXTRACT_INPUT_PATH = Path(__file__).with_name("raw-wiktextract-data.jsonl")
WIKTEXTRACT_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_refined_plus_wiktextract.json")
WIKTEXTRACT_INDEX_CACHE_PATH = Path(__file__).with_name(".wiktextract_it_index.json")

# Review reasons that select an entry for enrichment unless --review-reasons overrides them.
DEFAULT_REVIEW_REASONS = {"no_viable_definition", "only_general_topics", "babelnet_ambiguous"}

# Lowercase Wiktextract part-of-speech code -> tag stored in the lexicon.
# (The original literal listed "adj" twice; a duplicate key is silently
# overwritten, so it is declared once here.)
POS_MAP = {
    "noun": "NOUN",
    "adj": "ADJ",
    "verb": "VERB",
    "adv": "ADV",
    "prep": "PREP",
    "conj": "CONJ",
    "pron": "PRON",
    "intj": "INTJ",
}

# Lowercase Wiktextract sense topic -> lexicon topic.
TOPIC_MAP = {
    "christianity": "religion",
    "religion": "religion",
    "history": "history",
    "agriculture": "agriculture",
    "engineering": "technology",
    "mechanics": "technology",
    "technology": "technology",
    "medicine": "health",
    "geography": "geography",
    "biology": "nature",
    "aeronautics": "transport",
}

# Lowercase Wiktextract category name -> lexicon topic.
CATEGORY_TOPIC_HINTS = {
    "religione-it": "religion",
    "cristianesimo-it": "religion",
    "storia-it": "history",
    "agricoltura-it": "agriculture",
    "medicina-it": "health",
    "ingegneria-it": "technology",
    "meccanica-it": "technology",
    "tecnologia-it": "technology",
    "geografia-it": "geography",
    "biologia-it": "nature",
    "aeronautica-it": "transport",
}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the offline Wiktextract enrichment.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Arricchisce il lessico refined leggendo offline il file raw-wiktextract-data.jsonl, "
            "senza effettuare richieste di rete."
        )
    )
    # Options are declared as data and registered in this exact order.
    option_specs = (
        ("--input", {"type": Path, "default": REFINED_LEXICON_OUTPUT_PATH, "help": "Lessico refined di partenza."}),
        (
            "--review",
            {
                "type": Path,
                "default": REVIEW_INPUT_PATH,
                "help": "File to_be_review.json da usare per selezionare i lemmi prioritari.",
            },
        ),
        (
            "--wiktextract",
            {"type": Path, "default": WIKTEXTRACT_INPUT_PATH, "help": "File JSONL raw estratto da Wiktionary."},
        ),
        (
            "--output",
            {
                "type": Path,
                "default": WIKTEXTRACT_OUTPUT_PATH,
                "help": "Lessico refined con blocco wiktextract aggiunto.",
            },
        ),
        (
            "--index-cache",
            {
                "type": Path,
                "default": WIKTEXTRACT_INDEX_CACHE_PATH,
                "help": "Cache dell'indice lemmi->righe del JSONL per velocizzare i rilanci.",
            },
        ),
        (
            "--word-limit",
            {
                "type": int,
                "default": 0,
                "help": "Limite massimo di parole da elaborare. 0 = tutte le candidate.",
            },
        ),
        ("--words", {"default": "", "help": "Lista separata da virgole di lemmi specifici da arricchire."}),
        (
            "--review-reasons",
            {
                "default": ",".join(sorted(DEFAULT_REVIEW_REASONS)),
                "help": "Motivi del file review da trattare con priorita, separati da virgole.",
            },
        ),
        (
            "--skip-existing",
            {
                "action": "store_true",
                "help": "Salta le voci che nel lessico di input hanno gia un blocco wiktextract utile.",
            },
        ),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Return the JSON document stored at ``path``, or ``default`` if the file is absent.

    Args:
        path: File to read as UTF-8 text.
        default: Value returned, as is, when ``path`` does not exist.
    """
    if path.exists():
        return json.loads(path.read_text(encoding="utf-8"))
    return default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Serialize ``payload`` to ``path`` as UTF-8 JSON with a 2-space indent.

    Non-ASCII characters are written literally rather than escaped.
    """
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def parse_csv_set(value: str) -> set[str]:
    """Split a comma-separated string into a set of trimmed, lowercased items.

    Empty items are dropped; a falsy ``value`` yields an empty set.
    """
    trimmed = (chunk.strip() for chunk in str(value or "").split(","))
    return {chunk.lower() for chunk in trimmed if chunk}
|
||||
|
||||
|
||||
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Return the ``(form, pos)`` identity of a lexicon entry.

    The form is ``normalized_form`` (or ``form`` when that is empty), trimmed
    and lowercased; the part of speech is trimmed and uppercased.
    """
    raw_form = entry.get("normalized_form") or entry.get("form") or ""
    raw_pos = entry.get("pos") or ""
    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
|
||||
|
||||
|
||||
def load_or_build_index(jsonl_path: Path, index_cache_path: Path) -> Dict[str, List[int]]:
    """Return a ``word -> [byte offsets]`` index of the Italian rows of a JSONL dump.

    The index is cached in ``index_cache_path`` together with the source path,
    size and mtime; the cache is reused only when that metadata still matches.

    The file is scanned in binary mode so the stored offsets are real byte
    positions. Text-mode ``tell()`` returns an opaque cookie and is slow to
    call once per line.

    Args:
        jsonl_path: Raw Wiktextract JSONL file.
        index_cache_path: JSON file holding the cached index.

    Returns:
        Lowercased word mapped to the offsets of its rows, in file order.

    Raises:
        FileNotFoundError: If the index must be built and ``jsonl_path`` is missing.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if jsonl_path.exists():
        stats = jsonl_path.stat()
        size, mtime = stats.st_size, stats.st_mtime
    else:
        size, mtime = 0, 0
    expected_meta = {
        "source": str(jsonl_path.resolve()),
        "size": size,
        "mtime": mtime,
        # Marks the offsets as byte positions, so a cache written with
        # text-mode cookies is rebuilt instead of reused.
        "offsets": "bytes",
    }

    cached = load_json(index_cache_path, {})
    if (
        isinstance(cached, dict)
        and cached.get("meta") == expected_meta
        and isinstance(cached.get("index"), dict)
    ):
        return {str(key): list(value) for key, value in cached["index"].items()}

    index: Dict[str, List[int]] = {}
    with jsonl_path.open("rb") as handle:
        position = 0
        for line in handle:
            line_start = position
            position += len(line)
            raw = line.strip()
            if not raw:
                continue
            obj = json.loads(raw)
            # Rows that are not JSON objects cannot carry a word; skip them.
            if not isinstance(obj, dict) or obj.get("lang_code") != "it":
                continue
            word = str(obj.get("word", "")).strip().lower()
            if word:
                index.setdefault(word, []).append(line_start)

    write_json(index_cache_path, {"meta": expected_meta, "index": index})
    return index


def read_jsonl_objects_at_offsets(jsonl_path: Path, offsets: Sequence[int]) -> List[Dict[str, object]]:
    """Load the JSON rows that start at the given byte offsets.

    Args:
        jsonl_path: Raw Wiktextract JSONL file.
        offsets: Byte offsets as produced by ``load_or_build_index``.

    Returns:
        The decoded rows in the order of ``offsets``. Offsets pointing at the
        end of the file or at a blank line are skipped. With no offsets the
        file is not opened at all.
    """
    objects: List[Dict[str, object]] = []
    if not offsets:
        return objects
    with jsonl_path.open("rb") as handle:
        for offset in offsets:
            handle.seek(offset)
            line = handle.readline()
            if not line.strip():
                continue
            # json.loads accepts UTF-8 bytes directly.
            objects.append(json.loads(line))
    return objects
|
||||
|
||||
|
||||
def map_pos(value: str) -> str:
    """Translate a Wiktextract part-of-speech code into the lexicon tag.

    Codes found in ``POS_MAP`` are mapped; any other non-empty code is
    returned uppercased, and an empty or falsy value yields ``""``.
    """
    code = str(value or "").strip().lower()
    if code in POS_MAP:
        return POS_MAP[code]
    return code.upper() if code else ""
|
||||
|
||||
|
||||
def normalize_text(text: str) -> str:
    """Trim ``text`` and collapse every run of whitespace into a single space.

    Falsy input (``None``, ``""``, ``0``) yields an empty string.
    """
    return re.sub(r"\s+", " ", str(text or "").strip())
|
||||
|
||||
|
||||
def sense_topics(sense: Dict[str, object], categories: Sequence[str]) -> List[str]:
    """Derive lexicon topics from a sense's topics and from category names.

    Args:
        sense: Wiktextract sense; its ``topics`` are looked up in ``TOPIC_MAP``.
        categories: Category names looked up in ``CATEGORY_TOPIC_HINTS``.

    Returns:
        The sorted, de-duplicated topics; labels without a mapping are ignored.
    """
    from_sense = (TOPIC_MAP.get(str(label).strip().lower()) for label in sense.get("topics", []) or [])
    from_categories = (CATEGORY_TOPIC_HINTS.get(str(name).strip().lower()) for name in categories)
    return sorted({mapped for mapped in (*from_sense, *from_categories) if mapped})
|
||||
|
||||
|
||||
def word_level_topics(entries: Sequence[Dict[str, object]], categories: Sequence[str]) -> List[str]:
    """Union the ``sense_topics`` of every dictionary sense across ``entries``.

    Args:
        entries: Simplified Wiktextract entries.
        categories: Category names forwarded unchanged to ``sense_topics``.

    Returns:
        The sorted set of topics; senses that are not dictionaries are ignored.
    """
    dict_senses = (
        sense
        for entry in entries
        for sense in (entry.get("senses", []) or [])
        if isinstance(sense, dict)
    )
    collected = set()
    for sense in dict_senses:
        collected.update(sense_topics(sense, categories))
    return sorted(collected)
|
||||
|
||||
|
||||
def grammar_hints(entries: Sequence[Dict[str, object]]) -> List[str]:
    """Extract grammar hints from the tags and glosses of Wiktextract entries.

    ``voce_verbale`` is added for verb entries tagged ``form-of``. The other
    hints are the keyword itself when it occurs (case-insensitively) in a
    gloss; the noun-only keywords are checked for noun entries alone.

    Returns:
        The sorted, de-duplicated hint names.
    """
    noun_only = ("diminutivo", "accrescitivo", "peggiorativo")
    any_pos = ("congiuntivo", "imperativo", "plurale")

    hints = set()
    for entry in entries:
        pos = str(entry.get("pos", "")).lower()
        tags = [str(tag).lower() for tag in entry.get("tags", []) or []]
        if pos == "verb" and "form-of" in tags:
            hints.add("voce_verbale")

        keywords = any_pos + noun_only if pos == "noun" else any_pos
        for sense in entry.get("senses", []) or []:
            if not isinstance(sense, dict):
                continue
            for gloss in sense.get("glosses", []) or []:
                lowered = str(gloss).lower()
                hints.update(keyword for keyword in keywords if keyword in lowered)
    return sorted(hints)
|
||||
|
||||
|
||||
def simplify_entry(obj: Dict[str, object]) -> Dict[str, object]:
    """Reduce a raw Wiktextract row to the fields stored in the lexicon.

    Senses that are not dictionaries, or that have no non-empty gloss, are
    dropped. Text values go through ``normalize_text``.

    Args:
        obj: One decoded row of the Wiktextract JSONL file.

    Returns:
        A dictionary with word, language, mapped POS, tags, categories,
        simplified senses, synonyms and related terms.
    """

    def named_items(key: str) -> List[object]:
        # Keeps only dictionary items carrying a truthy "word".
        return [item for item in obj.get(key, []) or [] if isinstance(item, dict) and item.get("word")]

    categories = [normalize_text(item) for item in obj.get("categories", []) or [] if item]

    senses = []
    for sense in obj.get("senses", []) or []:
        if not isinstance(sense, dict):
            continue
        glosses = [normalize_text(item) for item in sense.get("glosses", []) or [] if normalize_text(item)]
        if not glosses:
            continue
        examples = [
            normalize_text(example.get("text", ""))
            for example in sense.get("examples", []) or []
            if isinstance(example, dict) and normalize_text(example.get("text", ""))
        ]
        senses.append(
            {
                "glosses": glosses,
                "examples": examples,
                "topics": sense_topics(sense, categories),
                "tags": [str(tag) for tag in sense.get("tags", []) or [] if tag],
                "categories": [normalize_text(item) for item in sense.get("categories", []) or [] if item],
            }
        )

    return {
        "word": obj.get("word"),
        "lang": obj.get("lang"),
        "lang_code": obj.get("lang_code"),
        "pos": map_pos(str(obj.get("pos", ""))),
        "pos_title": obj.get("pos_title"),
        "tags": [str(tag) for tag in obj.get("tags", []) or [] if tag],
        "categories": categories,
        "senses": senses,
        "synonyms": named_items("synonyms"),
        "related": named_items("related"),
    }
|
||||
|
||||
|
||||
def choose_best_entries(refined_entry: Dict[str, object], candidates: Sequence[Dict[str, object]]) -> List[Dict[str, object]]:
    """Prefer the candidates whose part of speech matches the refined entry.

    The comparison is case-insensitive. When no candidate matches, all of
    them are returned as a new list.
    """
    wanted_pos = str(refined_entry.get("pos", "")).upper()
    same_pos = [item for item in candidates if str(item.get("pos", "")).upper() == wanted_pos]
    return same_pos or list(candidates)
|
||||
|
||||
|
||||
def wiktextract_already_useful(entry: Dict[str, object]) -> bool:
    """Tell whether an entry already has a conclusive ``wiktextract`` block.

    A block is conclusive when its status (case-insensitive) is ``missing``
    or ``no_match``, or ``enriched`` with definitions or entries present.
    """
    block = entry.get("wiktextract", {})
    if not isinstance(block, dict):
        return False
    status = str(block.get("status", "")).lower()
    if status in {"missing", "no_match"}:
        return True
    return status == "enriched" and bool(block.get("definitions") or block.get("entries"))
|
||||
|
||||
|
||||
def select_targets(
    refined_payload: Dict[str, object],
    review_payload: Dict[str, object],
    review_reasons: set[str],
    explicit_words: set[str],
    word_limit: int,
    skip_existing: bool,
) -> Tuple[List[Dict[str, object]], int]:
    """Choose which refined entries should be enriched.

    With ``explicit_words`` only those words are considered, in alphabetical
    order. Otherwise the review file drives the selection: an entry is picked
    when it shares a reason with ``review_reasons`` or its BabelNet status is
    ``no_match``.

    Args:
        refined_payload: Refined lexicon; entries are looked up by lowercased
            ``form`` (a later entry with the same form replaces an earlier one).
        review_payload: Content of the review file.
        review_reasons: Lowercased reason codes that qualify an entry.
        explicit_words: Lowercased words requested on the command line.
        word_limit: Maximum number of targets; ``0`` or less means no limit.
        skip_existing: Skip entries for which ``wiktextract_already_useful``
            is true.

    Returns:
        ``(targets, skipped_existing_count)``.
    """
    refined_by_word: Dict[str, Dict[str, object]] = {}
    for entry in refined_payload.get("entries", []) or []:
        if isinstance(entry, dict) and entry.get("form"):
            refined_by_word[str(entry.get("form", "")).lower()] = entry

    skipped_existing_count = 0

    if explicit_words:
        selected: List[Dict[str, object]] = []
        # Sorted so that --word-limit keeps the same words on every run; set
        # iteration order changes between interpreter runs.
        for word in sorted(explicit_words):
            entry = refined_by_word.get(word)
            if entry is None:
                continue
            if skip_existing and wiktextract_already_useful(entry):
                skipped_existing_count += 1
                continue
            selected.append(entry)
        if word_limit > 0:
            selected = selected[:word_limit]
        return selected, skipped_existing_count

    targets: List[Dict[str, object]] = []
    seen = set()
    for review_entry in review_payload.get("entries", []) or []:
        if not isinstance(review_entry, dict):
            continue
        word = str(review_entry.get("form", "")).strip().lower()
        if not word or word in seen:
            continue
        refined = refined_by_word.get(word)
        if refined is None:
            continue
        if skip_existing and wiktextract_already_useful(refined):
            skipped_existing_count += 1
            continue
        reasons = {str(item).lower() for item in review_entry.get("review_reasons", []) or []}
        babelnet_status = str((refined.get("babelnet") or {}).get("status", "")).lower()
        if reasons.intersection(review_reasons) or babelnet_status == "no_match":
            targets.append(refined)
            seen.add(word)
        if word_limit > 0 and len(targets) >= word_limit:
            break

    return targets, skipped_existing_count
|
||||
|
||||
|
||||
def wiktextract_payload_for_entry(refined_entry: Dict[str, object], matches: Sequence[Dict[str, object]]) -> Dict[str, object]:
    """Build the ``wiktextract`` block stored on a refined entry.

    Args:
        refined_entry: Entry being enriched; used to prefer same-POS matches.
        matches: Simplified Wiktextract entries found for the word.

    Returns:
        A payload with the keys ``status``, ``matched``, ``definitions``,
        ``entries``, ``topic_hints``, ``grammar_hints`` and ``categories``.
        ``status`` is ``missing`` without matches, ``enriched`` when at least
        one gloss was found, ``entries_without_definitions`` otherwise.
    """
    if not matches:
        return {
            "status": "missing",
            "matched": False,
            "definitions": [],
            "entries": [],
            "topic_hints": [],
            "grammar_hints": [],
            # Present so both branches return the same set of keys.
            "categories": [],
        }

    selected_entries = choose_best_entries(refined_entry, matches)

    raw_glosses: List[object] = []
    all_categories: List[object] = []
    for item in selected_entries:
        all_categories.extend(item.get("categories", []) or [])
        for sense in item.get("senses", []) or []:
            if isinstance(sense, dict):
                raw_glosses.extend(sense.get("glosses", []) or [])
    definitions = [normalize_text(gloss) for gloss in raw_glosses if normalize_text(gloss)]

    return {
        "status": "enriched" if definitions else "entries_without_definitions",
        "matched": bool(definitions),
        "definitions": definitions,
        "entries": selected_entries,
        "topic_hints": word_level_topics(selected_entries, all_categories),
        "grammar_hints": grammar_hints(selected_entries),
        "categories": sorted(set(normalize_text(item) for item in all_categories if normalize_text(item))),
    }
|
||||
|
||||
|
||||
def enrich_from_wiktextract(args: argparse.Namespace) -> Dict[str, object]:
    """Attach a ``wiktextract`` block to the selected entries and write the merged lexicon.

    Args:
        args: Parsed CLI options (``input``, ``review``, ``wiktextract``,
            ``index_cache``, ``output``, ``review_reasons``, ``words``,
            ``word_limit``, ``skip_existing``).

    Returns:
        A summary with the target, skipped, matched and missing counts and
        the output path as a string.

    Raises:
        ValueError: If the refined lexicon is not a dictionary with ``entries``.
    """
    refined_payload = load_json(args.input, {"entries": []})
    if not isinstance(refined_payload, dict) or "entries" not in refined_payload:
        raise ValueError(f"Lessico refined non valido: {args.input}")

    review_payload = load_json(args.review, {"entries": []})
    if not isinstance(review_payload, dict):
        review_payload = {"entries": []}

    targets, skipped_existing_count = select_targets(
        refined_payload,
        review_payload,
        parse_csv_set(args.review_reasons),
        parse_csv_set(args.words),
        args.word_limit,
        args.skip_existing,
    )
    total = len(targets)

    skipped_note = ""
    if args.skip_existing:
        skipped_note = f" | già saltati per wiktextract esistente: {skipped_existing_count}"
    print(f"Target selezionati: {total}" + skipped_note)

    index = load_or_build_index(args.wiktextract, args.index_cache)

    # Every refined entry is carried over; targets replace their own slot below.
    refined_index = {}
    for original in refined_payload.get("entries", []) or []:
        if isinstance(original, dict):
            refined_index[entry_key(original)] = deepcopy(original)

    matched_count = 0
    missing_count = 0
    for idx, entry in enumerate(targets, start=1):
        updated = deepcopy(entry)
        word = str(entry.get("form", "")).strip().lower()
        raw_rows = read_jsonl_objects_at_offsets(args.wiktextract, index.get(word, []))
        payload = wiktextract_payload_for_entry(updated, [simplify_entry(row) for row in raw_rows])
        updated["wiktextract"] = payload
        updated["wiktextract_generated_at"] = datetime.now().astimezone().isoformat(timespec="seconds")
        refined_index[entry_key(updated)] = updated

        # Anything that is not a definition match counts as missing.
        if payload.get("matched"):
            matched_count += 1
        else:
            missing_count += 1

        print(
            f"[{idx}/{total}] {word}: "
            f"status={payload.get('status')} "
            f"def={len(payload.get('definitions', []))} "
            f"topics={len(payload.get('topic_hints', []))} "
            f"entries={len(payload.get('entries', []))}"
        )

    merged_entries = sorted(
        refined_index.values(),
        key=lambda item: (str(item.get("normalized_form", "")), str(item.get("pos", ""))),
    )

    source_meta = refined_payload.get("meta", {}) if isinstance(refined_payload.get("meta"), dict) else {}
    merged_payload = {
        "meta": {
            **source_meta,
            "wiktextract_source": str(args.wiktextract),
            "wiktextract_generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
            "wiktextract_target_count": total,
            "wiktextract_skipped_existing_count": skipped_existing_count,
            "wiktextract_matched_count": matched_count,
            "wiktextract_missing_count": missing_count,
        },
        "entries": merged_entries,
    }
    write_json(args.output, merged_payload)

    return {
        "target_count": total,
        "skipped_existing_count": skipped_existing_count,
        "matched_count": matched_count,
        "missing_count": missing_count,
        "output": str(args.output),
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Run the offline Wiktextract enrichment and print its summary."""
    result = enrich_from_wiktextract(parse_args())
    summary = (
        f"Lessico con Wiktextract generato: {result['output']}",
        f"Voci trattate: {result['target_count']}",
        f"Voci già saltate: {result['skipped_existing_count']}",
        f"Match Wiktextract: {result['matched_count']}",
        f"Senza match Wiktextract: {result['missing_count']}",
    )
    for line in summary:
        print(line)


if __name__ == "__main__":
    main()
|
||||
678
enrich_review_from_wiktionary.py
Normal file
678
enrich_review_from_wiktionary.py
Normal file
@@ -0,0 +1,678 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default locations, all resolved next to this script.
_SCRIPT = Path(__file__)
REVIEW_INPUT_PATH = _SCRIPT.with_name("to_be_review.json")
WIKTIONARY_CACHE_PATH = _SCRIPT.with_name(".wiktionary_cache.json")
WIKTIONARY_OUTPUT_PATH = _SCRIPT.with_name("lexicon_it_refined_plus_wiktionary.json")
WIKTIONARY_API_URL = "https://it.wiktionary.org/w/api.php"

# Review reasons that select an entry for enrichment unless --review-reasons overrides them.
DEFAULT_REVIEW_REASONS = {"no_viable_definition", "only_general_topics", "babelnet_ambiguous"}

# Part-of-speech label (full or abbreviated) -> lexicon tag.
POS_ALIASES = {
    # nouns
    "sostantivo": "NOUN",
    "nome": "NOUN",
    "sost": "NOUN",
    # adjectives
    "aggettivo": "ADJ",
    "agg": "ADJ",
    # verbs
    "verbo": "VERB",
    "verb": "VERB",
    "verb form": "VERB_FORM",
    # adverbs
    "avverbio": "ADV",
    "avv": "ADV",
    # function words
    "preposizione": "PREP",
    "prep": "PREP",
    "congiunzione": "CONJ",
    "cong": "CONJ",
    "pronome": "PRON",
    "pron": "PRON",
    "articolo": "ART",
    # interjections and phrases
    "interiezione": "INTJ",
    "inter": "INTJ",
    "locuzione": "PHRASE",
    "loc": "PHRASE",
}

# Lexicon topic -> keywords (some are deliberate stems, e.g. "cattolic").
TOPIC_KEYWORDS = {
    "religion": (
        "religione",
        "cattolic",
        "sacro",
        "sacra",
        "devozion",
        "scapolare",
        "abbazia",
        "monastero",
    ),
    "clothing": (
        "abito",
        "vestito",
        "vestit",
        "abbigliamento",
        "indumento",
        "stoffa",
    ),
    "grammar": (
        "diminutivo",
        "voce verbale",
        "congiuntivo",
        "plurale",
        "singolare",
        "grammatica",
        "verbo",
    ),
    "geography": (
        "comune",
        "paese",
        "regione",
        "provincia",
        "citta",
        "localita",
        "frazione",
    ),
    "transport": (
        "veicolo",
        "motore",
        "treno",
        "aereo",
        "trasporto",
        "nave",
        "imbarcazione",
    ),
    "health": (
        "medicina",
        "ospedale",
        "malattia",
        "cura",
        "feriti",
        "ammalati",
        "sanitario",
    ),
}

# Grammar-related keywords, in declaration order.
GRAMMAR_KEYWORDS = (
    "diminutivo",
    "accrescitivo",
    "peggiorativo",
    "alterato",
    "voce verbale",
    "congiuntivo",
    "participio",
    "plurale",
    "singolare",
    "maschile",
    "femminile",
)
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the online Wiktionary enrichment.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Arricchisce le voci problematiche del lessico refined con definizioni e metadati "
            "estratti da it.wiktionary.org."
        )
    )
    # Options are declared as data and registered in this exact order.
    option_specs = (
        ("--input", {"type": Path, "default": REFINED_LEXICON_OUTPUT_PATH, "help": "Lessico refined di partenza."}),
        (
            "--review",
            {
                "type": Path,
                "default": REVIEW_INPUT_PATH,
                "help": "File to_be_review.json da usare per selezionare le voci prioritarie.",
            },
        ),
        (
            "--output",
            {
                "type": Path,
                "default": WIKTIONARY_OUTPUT_PATH,
                "help": "Nuovo lessico con blocco wiktionary aggiunto.",
            },
        ),
        (
            "--cache",
            {"type": Path, "default": WIKTIONARY_CACHE_PATH, "help": "Cache locale delle risposte Wiktionary."},
        ),
        (
            "--word-limit",
            {
                "type": int,
                "default": 0,
                "help": "Limite massimo di parole da elaborare. 0 = tutte le candidate.",
            },
        ),
        ("--sleep", {"type": float, "default": 1.0, "help": "Pausa tra le richieste HTTP a Wiktionary."}),
        (
            "--save-every",
            {
                "type": int,
                "default": 25,
                "help": "Salva cache e output ogni N parole elaborate per non perdere progresso.",
            },
        ),
        (
            "--retry-429",
            {
                "type": int,
                "default": 3,
                "help": "Numero massimo di tentativi aggiuntivi se Wiktionary risponde HTTP 429.",
            },
        ),
        (
            "--backoff-429",
            {
                "type": float,
                "default": 30.0,
                "help": "Secondi di attesa iniziali dopo un HTTP 429; raddoppiano a ogni nuovo tentativo.",
            },
        ),
        (
            "--stop-on-429",
            {
                "action": "store_true",
                "help": "Se attivo, al primo HTTP 429 salva lo stato e interrompe il batch senza altri tentativi.",
            },
        ),
        ("--words", {"default": "", "help": "Lista separata da virgole di lemmi specifici da arricchire."}),
        (
            "--review-reasons",
            {
                "default": ",".join(sorted(DEFAULT_REVIEW_REASONS)),
                "help": "Motivi del file review da trattare con priorita, separati da virgole.",
            },
        ),
        (
            "--api-url",
            {"default": WIKTIONARY_API_URL, "help": "Endpoint MediaWiki Action API di Wiktionary."},
        ),
        (
            "--skip-existing",
            {
                "action": "store_true",
                "help": "Salta le voci che nel lessico di input hanno già un blocco wiktionary con stato utile.",
            },
        ),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Read a UTF-8 JSON file, falling back to ``default`` when it does not exist.

    Args:
        path: File to read.
        default: Value returned unchanged if ``path`` is absent.
    """
    return json.loads(path.read_text(encoding="utf-8")) if path.exists() else default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Write ``payload`` to ``path`` as indented (2 spaces) UTF-8 JSON.

    Non-ASCII characters are kept literal instead of being escaped.
    """
    document = json.dumps(payload, ensure_ascii=False, indent=2)
    path.write_text(document, encoding="utf-8")
|
||||
|
||||
|
||||
def parse_csv_set(value: str) -> set[str]:
    """Turn a comma-separated string into a set of trimmed, lowercased tokens.

    Blank tokens are discarded; a falsy ``value`` gives an empty set.
    """
    tokens = set()
    for chunk in str(value or "").split(","):
        cleaned = chunk.strip()
        if cleaned:
            tokens.add(cleaned.lower())
    return tokens
|
||||
|
||||
|
||||
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Compute the ``(form, pos)`` key that identifies a lexicon entry.

    ``normalized_form`` wins over ``form``; the form is trimmed and
    lowercased, the part of speech trimmed and uppercased.
    """
    form_source = entry.get("normalized_form") or entry.get("form") or ""
    pos_source = entry.get("pos") or ""
    form = str(form_source).strip().lower()
    pos = str(pos_source).strip().upper()
    return form, pos
|
||||
|
||||
|
||||
def fetch_wikitext(title: str, api_url: str) -> Dict[str, object]:
    """Download the wikitext of one page through a MediaWiki Action API endpoint.

    Args:
        title: Page title to look up.
        api_url: Base URL of the API endpoint; the query string is appended.

    Returns:
        ``{"status": "missing"}`` (plus ``title`` when the API returned a page
        record flagged as missing) if the page is not available; otherwise a
        dict with ``status`` (``"ok"`` when the main slot has content,
        ``"empty"`` otherwise), ``title``, ``pageid`` and ``wikitext``.

    Raises:
        urllib.error.HTTPError: Propagated unchanged from ``urlopen``.
    """
    query = urllib.parse.urlencode(
        {
            "action": "query",
            "prop": "revisions",
            "titles": title,
            "rvprop": "content",
            "rvslots": "main",
            "formatversion": "2",
            "format": "json",
        }
    )
    request = urllib.request.Request(
        f"{api_url}?{query}",
        headers={
            "User-Agent": "cruciverba-alpha/0.1 (local lexical enrichment)",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        payload = json.loads(response.read().decode("utf-8"))

    pages = (payload.get("query") or {}).get("pages") or []
    if not pages:
        return {"status": "missing"}

    # Only the first page record is inspected; a single title was requested.
    page = pages[0]
    if page.get("missing"):
        return {"status": "missing", "title": page.get("title", title)}

    revisions = page.get("revisions") or []
    if revisions:
        main_slot = (revisions[0].get("slots") or {}).get("main") or {}
        content = str(main_slot.get("content") or "")
    else:
        content = ""

    return {
        "status": "ok" if content else "empty",
        "title": page.get("title", title),
        "pageid": page.get("pageid"),
        "wikitext": content,
    }
|
||||
|
||||
|
||||
def fetch_wikitext_with_retry(title: str, args: argparse.Namespace) -> Dict[str, object]:
    """Call ``fetch_wikitext`` and retry with exponential backoff on HTTP 429.

    Reads ``args.api_url``, ``args.backoff_429`` (initial wait in seconds,
    clamped to at least 1.0 and doubled after every wait), ``args.retry_429``
    (maximum number of extra attempts) and ``args.stop_on_429`` (give up on
    the first 429).

    Raises:
        urllib.error.HTTPError: For any non-429 error, or for a 429 once
            retries are disabled or exhausted.
    """
    wait_seconds = max(1.0, float(args.backoff_429))
    retries_done = 0
    while True:
        try:
            return fetch_wikitext(title, args.api_url)
        except urllib.error.HTTPError as exc:
            # Short-circuit order matters: the retry budget is only read for
            # a 429 that we are actually allowed to retry.
            if exc.code != 429 or args.stop_on_429 or retries_done >= max(0, int(args.retry_429)):
                raise
            retries_done += 1
            print(f"[429] {title}: attendo {wait_seconds:.1f}s prima del tentativo {retries_done}/{args.retry_429}")
            time.sleep(wait_seconds)
            wait_seconds *= 2
|
||||
|
||||
|
||||
def normalize_heading(text: str) -> str:
    """Reduce a wikitext heading to a comparable lower-case label.

    The Italian language marker ``{{-it-}}`` (ignoring case and inner spaces)
    is returned as-is, because stripping wiki markup would erase it. Any other
    heading is cleaned with ``strip_wikicode``, trimmed and lower-cased.
    """
    compact = str(text or "").strip().lower().replace(" ", "")
    if compact == "{{-it-}}":
        return compact
    return strip_wikicode(text).strip().lower()
|
||||
|
||||
|
||||
def extract_italian_section(wikitext: str) -> str:
    """Return the body of the Italian language section of a Wiktionary page.

    A language section starts at a level-2 heading (``== ... ==``) whose label
    is ``{{-it-}}``, ``italiano`` or ``it`` and ends right before the next
    level-2 heading, or at the end of the page.

    Args:
        wikitext: Full wikitext of the page.

    Returns:
        The text between the Italian heading and the next level-2 heading
        (heading line excluded), or ``""`` when no Italian section exists.
    """
    # Exactly two "=" on each side: without the lookarounds the pattern also
    # matches "=== ... ===" sub-headings, which would cut the section at its
    # first sub-heading.
    section_pattern = re.compile(r"^==(?!=)\s*(.*?)\s*(?<!=)==\s*$", re.MULTILINE)
    matches = list(section_pattern.finditer(wikitext))
    for index, match in enumerate(matches):
        label = match.group(1)
        raw_heading = str(label or "").strip().lower().replace(" ", "")
        if raw_heading == "{{-it-}}" or normalize_heading(label) in {"italiano", "it"}:
            start = match.end()
            end = matches[index + 1].start() if index + 1 < len(matches) else len(wikitext)
            return wikitext[start:end]
    return ""
|
||||
|
||||
|
||||
def strip_templates(text: str) -> str:
    """Remove ``{{...}}`` templates from wikitext until nothing changes.

    Each pass first replaces an innermost ``{{name|args}}`` template with its
    raw argument text (pipes included), then deletes every remaining
    innermost ``{{...}}`` template. Passes repeat until a fixed point is
    reached, which peels nested templates from the inside out.
    """
    with_arguments = re.compile(r"\{\{([^{}|]+)\|([^{}]+?)\}\}")
    innermost = re.compile(r"\{\{[^{}]+\}\}")

    before, after = None, text
    while before != after:
        before = after
        after = innermost.sub("", with_arguments.sub(r"\2", before))
    return after
|
||||
|
||||
|
||||
def strip_wikicode(text: str) -> str:
    """Convert a wikitext fragment into plain text.

    Steps, in order: drop HTML comments, ``<ref>`` footnotes and any other
    tag; remove templates via ``strip_templates``; unwrap ``[[target|label]]``
    and ``[[target]]`` links; remove bold/italic quote markers; turn
    non-breaking spaces into regular spaces; collapse whitespace; finally trim
    spaces and the characters ``.;:-`` from both ends.

    Args:
        text: Wikitext fragment; a falsy value is treated as ``""``.

    Returns:
        The cleaned single-line text.
    """
    value = str(text or "")
    value = re.sub(r"<!--.*?-->", " ", value, flags=re.DOTALL)
    # Self-closing references must go first: otherwise the paired rule below
    # treats `<ref name="x" />` as an opening tag and swallows everything up
    # to the next `</ref>`.
    value = re.sub(r"<ref\b[^>]*/>", " ", value)
    value = re.sub(r"<ref[^>]*>.*?</ref>", " ", value, flags=re.DOTALL)
    value = re.sub(r"<[^>]+>", " ", value)
    value = strip_templates(value)
    value = re.sub(r"\[\[([^|\]]+)\|([^\]]+)\]\]", r"\2", value)
    value = re.sub(r"\[\[([^\]]+)\]\]", r"\1", value)
    value = value.replace("'''", "").replace("''", "")
    # Spell out both forms of the non-breaking space (HTML entity and the
    # U+00A0 character) instead of relying on an invisible literal.
    value = value.replace("&nbsp;", " ").replace("\u00a0", " ")
    value = re.sub(r"\s+", " ", value)
    return value.strip(" .;:-")
|
||||
|
||||
|
||||
def infer_topics(definitions: Sequence[str], categories: Sequence[str]) -> List[str]:
    """List the topics whose keywords occur in the definitions or categories.

    All definitions and categories are joined into one lower-cased text; a
    topic from ``TOPIC_KEYWORDS`` is selected when at least one of its
    keywords is a substring of that text.

    Args:
        definitions: Definition strings (any sequence, not only a list).
        categories: Category names (any sequence).

    Returns:
        The matching topic names, sorted alphabetically.
    """
    # Unpack instead of `definitions + list(categories)`: concatenation fails
    # with TypeError when `definitions` is a tuple or another non-list sequence.
    text = " ".join([*definitions, *categories]).lower()
    return sorted(
        topic
        for topic, keywords in TOPIC_KEYWORDS.items()
        if any(keyword in text for keyword in keywords)
    )
|
||||
|
||||
|
||||
def infer_grammar_hints(definitions: Sequence[str], raw_section: str) -> List[str]:
    """List the grammar keywords found in the definitions or the raw section.

    The definitions and the raw section text are joined and lower-cased; every
    entry of ``GRAMMAR_KEYWORDS`` that is a substring of the result is kept.

    Returns:
        The matching keywords, de-duplicated and sorted.
    """
    joined_definitions = " ".join(definitions)
    haystack = f"{joined_definitions} {raw_section}".lower()
    return sorted({keyword for keyword in GRAMMAR_KEYWORDS if keyword in haystack})
|
||||
|
||||
|
||||
def detect_pos_from_heading(heading: str) -> Optional[str]:
    """Map a section heading to a part-of-speech tag, if it names one.

    The heading is normalized with ``normalize_heading`` and compared with the
    labels of ``POS_ALIASES``. Longer labels are tried first, so a more
    specific label wins over a shorter one contained in it.

    Returns:
        The tag of the first label contained in the heading, or ``None`` when
        the heading is empty or no label matches.
    """
    normalized = normalize_heading(heading)
    if not normalized:
        return None
    longest_first = sorted(POS_ALIASES.items(), key=lambda pair: len(pair[0]), reverse=True)
    return next((pos for label, pos in longest_first if label in normalized), None)
|
||||
|
||||
|
||||
def parse_template_marker(line: str) -> Tuple[Optional[str], Optional[str]]:
    """Classify a line consisting of a single ``{{-marker-}}`` template.

    Args:
        line: One line of wikitext.

    Returns:
        A ``(kind, value)`` pair:

        * ``(None, None)`` when the line is not a ``{{-...-}}`` marker;
        * ``("language", "it")`` for the Italian language marker;
        * ``("pos", tag)`` when the marker starts with a ``POS_ALIASES`` label
          (longest labels are tried first);
        * ``("subsection", "sinonimi")`` for markers starting with ``sinon``;
        * ``("subsection", marker)`` for every other marker.
    """
    match = re.match(r"^\{\{-([^{}|]+?)-?(?:\|.*)?\}\}$", line.strip(), flags=re.IGNORECASE)
    if not match:
        return None, None

    marker = match.group(1).strip().lower()
    if marker == "it":
        return "language", "it"

    for label, pos in sorted(POS_ALIASES.items(), key=lambda item: len(item[0]), reverse=True):
        if marker.startswith(label):
            return "pos", pos

    if marker.startswith("sinon"):
        return "subsection", "sinonimi"

    # Any other marker (etim..., trad..., sill..., pron..., var..., note..., or
    # an unknown one) is reported as a subsection under its own name; the
    # former explicit prefix check returned exactly this and was redundant.
    return "subsection", marker
|
||||
|
||||
|
||||
def parse_wiktionary_section(section_text: str) -> Dict[str, object]:
    """Parse the wikitext of one language section into structured entries.

    The section is scanned line by line. A part-of-speech marker (a
    ``{{-...-}}`` template or a level-3 heading recognized by
    ``detect_pos_from_heading``) opens a new entry; other markers and headings
    only switch the current subsection. Inside an entry, ``#`` lines are
    definitions, ``#:`` / ``#*`` lines are examples, ``#;`` lines are ignored,
    and ``*`` bullets are synonyms while the current subsection name starts
    with ``sinonim``. ``[[Categoria:...]]`` links are collected from every
    non-empty line.

    Returns:
        A dict with ``entries`` (one dict per part of speech with ``pos``,
        ``heading``, ``definitions``, ``examples``, ``synonyms``),
        ``categories`` (sorted, unique, non-empty), ``definitions`` (all
        definitions in page order), ``topic_hints`` and ``grammar_hints``.
    """
    heading_re = re.compile(r"^(={3,4})\s*(.*?)\s*\1\s*$")
    entries: List[Dict[str, object]] = []
    categories: List[str] = []
    current: Optional[Dict[str, object]] = None
    subsection = ""

    def open_entry(pos: object, heading: object) -> Dict[str, object]:
        """Append a fresh, empty part-of-speech entry and return it."""
        fresh = {
            "pos": pos,
            "heading": heading,
            "definitions": [],
            "examples": [],
            "synonyms": [],
        }
        entries.append(fresh)
        return fresh

    for raw_line in section_text.splitlines():
        line = raw_line.rstrip()
        if not line:
            continue

        categories.extend(
            strip_wikicode(name) for name in re.findall(r"\[\[Categoria:([^\]]+)\]\]", line)
        )

        kind, value = parse_template_marker(line)
        if kind == "pos":
            current = open_entry(value, value)
            subsection = ""
            continue
        if kind == "subsection":
            subsection = str(value or "")
            continue

        heading_match = heading_re.match(line)
        if heading_match is not None:
            equals_run, heading = heading_match.groups()
            # Only level-3 headings may open a part-of-speech entry.
            # NOTE(review): level-4 headings are treated as plain subsection
            # switches; the nesting was inferred from syntax, confirm against
            # the original layout.
            pos = detect_pos_from_heading(heading) if len(equals_run) == 3 else None
            if pos:
                current = open_entry(pos, strip_wikicode(heading))
                subsection = ""
            else:
                subsection = normalize_heading(heading)
            continue

        # Content lines before the first part-of-speech marker belong nowhere.
        if current is None:
            continue

        body = line.lstrip()
        if body.startswith("#"):
            if body.startswith(("#:", "#*")):
                bucket, cleaned = "examples", strip_wikicode(body[2:].strip())
            elif body.startswith("#;"):
                continue
            else:
                bucket, cleaned = "definitions", strip_wikicode(body.lstrip("#").strip())
        elif subsection.startswith("sinonim") and body.startswith("*"):
            bucket, cleaned = "synonyms", strip_wikicode(body.lstrip("*").strip())
        else:
            continue

        if cleaned:
            current[bucket].append(cleaned)

    flat_definitions = [text for item in entries for text in item["definitions"]]
    return {
        "entries": entries,
        "categories": sorted(set(filter(None, categories))),
        "definitions": flat_definitions,
        "topic_hints": infer_topics(flat_definitions, categories),
        "grammar_hints": infer_grammar_hints(flat_definitions, section_text),
    }
|
||||
|
||||
|
||||
def wiktionary_payload_for_entry(entry: Dict[str, object], api_response: Dict[str, object]) -> Dict[str, object]:
    """Build the ``wiktionary`` block stored on a lexicon entry.

    Args:
        entry: Lexicon entry; only its ``form`` is read, as a fallback title.
        api_response: Result of the wikitext fetch (``status``, ``title``,
            ``pageid``, ``wikitext``).

    Returns:
        A dict whose ``status`` is the API status when it is not ``"ok"``,
        ``"no_italian_section"`` when the page has no Italian section,
        ``"section_without_definitions"`` when parsing found no definition,
        and ``"enriched"`` otherwise. Only the last two carry ``pageid`` and
        ``raw_excerpt`` (first 4000 characters of the Italian section).
    """

    def unmatched(status: str, url_title: object) -> Dict[str, object]:
        """Payload skeleton for every outcome that yields no parsed data."""
        return {
            "status": status,
            "matched": False,
            "page_title": api_response.get("title") or entry.get("form"),
            "source_url": f"https://it.wiktionary.org/wiki/{urllib.parse.quote(str(url_title))}",
            "definitions": [],
            "entries": [],
            "topic_hints": [],
            "grammar_hints": [],
            "categories": [],
        }

    status = str(api_response.get("status", "missing"))
    if status != "ok":
        # Failed lookups link to the entry form, not to the API title.
        return unmatched(status, entry.get("form", ""))

    url_title = api_response.get("title") or entry.get("form", "")
    italian_section = extract_italian_section(str(api_response.get("wikitext") or ""))
    if not italian_section:
        return unmatched("no_italian_section", url_title)

    parsed = parse_wiktionary_section(italian_section)
    matched = bool(parsed["definitions"])
    return {
        "status": "enriched" if matched else "section_without_definitions",
        "matched": matched,
        "page_title": api_response.get("title") or entry.get("form"),
        "pageid": api_response.get("pageid"),
        "source_url": f"https://it.wiktionary.org/wiki/{urllib.parse.quote(str(url_title))}",
        "definitions": parsed["definitions"],
        "entries": parsed["entries"],
        "topic_hints": parsed["topic_hints"],
        "grammar_hints": parsed["grammar_hints"],
        "categories": parsed["categories"],
        "raw_excerpt": italian_section[:4000],
    }
|
||||
|
||||
|
||||
def select_targets(
    refined_payload: Dict[str, object],
    review_payload: Dict[str, object],
    review_reasons: set[str],
    explicit_words: set[str],
    word_limit: int,
    skip_existing: bool,
) -> Tuple[List[Dict[str, object]], int]:
    """Choose which lexicon entries should be enriched from Wiktionary.

    Entries are looked up by lower-cased ``form``. When ``explicit_words`` is
    non-empty only those words are considered, in alphabetical order.
    Otherwise the review file is walked in order and a word is selected when
    one of its ``review_reasons`` is in ``review_reasons`` or when the refined
    entry has a ``babelnet`` status of ``no_match``.

    Args:
        refined_payload: Lexicon payload with an ``entries`` list.
        review_payload: Review payload with an ``entries`` list.
        review_reasons: Lower-cased review reasons that qualify a word.
        explicit_words: Lower-cased words requested explicitly.
        word_limit: Maximum number of targets; ``0`` or less means no limit.
        skip_existing: Skip entries for which ``wiktionary_already_useful``
            is true, counting them in the second return value.

    Returns:
        ``(targets, skipped_existing_count)``.
    """
    refined_entries = [entry for entry in refined_payload.get("entries", []) or [] if isinstance(entry, dict)]
    refined_by_word = {str(entry.get("form", "")).lower(): entry for entry in refined_entries if entry.get("form")}
    skipped_existing_count = 0

    if explicit_words:
        selected: List[Dict[str, object]] = []
        # Sorted iteration: set order differs between runs, which made the
        # result (and the subset kept by `word_limit`) non-reproducible.
        for word in sorted(explicit_words):
            entry = refined_by_word.get(word)
            if entry is None:
                continue
            if skip_existing and wiktionary_already_useful(entry):
                skipped_existing_count += 1
                continue
            selected.append(entry)
        if word_limit > 0:
            selected = selected[:word_limit]
        return selected, skipped_existing_count

    review_entries = [entry for entry in review_payload.get("entries", []) or [] if isinstance(entry, dict)]
    selected_words: List[str] = []
    seen = set()

    for review_entry in review_entries:
        word = str(review_entry.get("form", "")).strip().lower()
        if not word or word in seen:
            continue
        reasons = {str(item).lower() for item in review_entry.get("review_reasons", []) or []}
        refined = refined_by_word.get(word)
        if refined is None:
            continue
        if skip_existing and wiktionary_already_useful(refined):
            skipped_existing_count += 1
            continue
        babelnet_status = str((refined.get("babelnet") or {}).get("status", "")).lower()
        if reasons.intersection(review_reasons) or babelnet_status == "no_match":
            selected_words.append(word)
            seen.add(word)
            if word_limit > 0 and len(selected_words) >= word_limit:
                break

    # Every selected word was taken from `refined_by_word`, so plain indexing is safe.
    return [refined_by_word[word] for word in selected_words], skipped_existing_count
|
||||
|
||||
|
||||
def wiktionary_already_useful(entry: Dict[str, object]) -> bool:
    """Tell whether an entry already carries a conclusive Wiktionary result.

    A result is conclusive when its status is ``enriched`` and it holds
    definitions or entries, or when the status records a definitive negative
    outcome (``missing``, ``no_italian_section``,
    ``section_without_definitions``, ``empty``). Status comparison is
    case-insensitive; a non-dict ``wiktionary`` value is never conclusive.
    """
    block = entry.get("wiktionary", {})
    if not isinstance(block, dict):
        return False

    status = str(block.get("status", "")).lower()
    if status == "enriched":
        return bool(block.get("definitions") or block.get("entries"))
    return status in {"missing", "no_italian_section", "section_without_definitions", "empty"}
|
||||
|
||||
|
||||
def enrich_from_wiktionary(args: argparse.Namespace) -> Dict[str, object]:
    """Enrich the selected lexicon entries with Wiktionary data and save the result.

    Loads the refined lexicon (``args.input``), the review file
    (``args.review``) and the response cache (``args.cache``), selects the
    targets, fetches each word (cache first, network otherwise), attaches a
    ``wiktionary`` block to a copy of the entry and writes the merged lexicon
    to ``args.output``. Progress (cache and merged lexicon) is saved every
    ``args.save_every`` processed words, at the end, and before the batch is
    stopped by a network failure.

    Args:
        args: Parsed command-line options.

    Returns:
        Run statistics: ``target_count``, ``processed_count``,
        ``skipped_existing_count``, ``cache_hits``, ``network_calls``,
        ``network_attempts``, ``output``, ``stopped_reason``, ``stop_word``.

    Raises:
        ValueError: If the input lexicon is not a dict with an ``entries`` key.
        urllib.error.HTTPError: For HTTP errors other than 429 (after saving
            progress). A 429 ends the batch early without raising.
        urllib.error.URLError, TimeoutError: For network failures (after
            saving progress).
    """
    refined_payload = load_json(args.input, {"entries": []})
    if not isinstance(refined_payload, dict) or "entries" not in refined_payload:
        raise ValueError(f"Lessico refined non valido: {args.input}")

    review_payload = load_json(args.review, {"entries": []})
    if not isinstance(review_payload, dict):
        review_payload = {"entries": []}

    cache = load_json(args.cache, {})
    if not isinstance(cache, dict):
        cache = {}

    targets, skipped_existing_count = select_targets(
        refined_payload,
        review_payload,
        parse_csv_set(args.review_reasons),
        parse_csv_set(args.words),
        args.word_limit,
        args.skip_existing,
    )

    enriched_entries = []
    cache_hits = 0
    network_calls = 0
    network_attempts = 0
    processed_count = 0
    stopped_reason = None
    stop_word = None

    print(
        f"Target selezionati: {len(targets)}"
        + (f" | già saltati per wiktionary esistente: {skipped_existing_count}" if args.skip_existing else "")
    )

    def persist_progress() -> None:
        """Write the cache and the lexicon merged with the entries enriched so far."""
        refined_index = {
            entry_key(entry): entry
            for entry in refined_payload.get("entries", []) or []
            if isinstance(entry, dict)
        }
        # Enriched copies replace the original entry sharing the same key.
        for item in enriched_entries:
            refined_index[entry_key(item)] = item

        merged_entries = list(refined_index.values())
        merged_entries.sort(key=lambda item: (str(item.get("normalized_form", "")), str(item.get("pos", ""))))

        merged_payload = {
            "meta": {
                **(refined_payload.get("meta", {}) if isinstance(refined_payload.get("meta"), dict) else {}),
                "wiktionary_source": args.api_url,
                "wiktionary_generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
                "wiktionary_target_count": len(targets),
                "wiktionary_processed_count": processed_count,
                "wiktionary_skipped_existing_count": skipped_existing_count,
                "wiktionary_cache_hits": cache_hits,
                "wiktionary_network_calls": network_calls,
                "wiktionary_network_attempts": network_attempts,
                "wiktionary_stopped_reason": stopped_reason,
                "wiktionary_stop_word": stop_word,
            },
            "entries": merged_entries,
        }

        write_json(args.cache, cache)
        write_json(args.output, merged_payload)

    def build_summary() -> Dict[str, object]:
        """Snapshot of the run counters, shared by the early and normal exits."""
        return {
            "target_count": len(targets),
            "processed_count": processed_count,
            "skipped_existing_count": skipped_existing_count,
            "cache_hits": cache_hits,
            "network_calls": network_calls,
            "network_attempts": network_attempts,
            "output": str(args.output),
            "stopped_reason": stopped_reason,
            "stop_word": stop_word,
        }

    for index, entry in enumerate(targets, start=1):
        updated = deepcopy(entry)
        word = str(entry.get("form", "")).strip()
        cache_key = word.lower()

        if cache_key in cache:
            api_response = cache[cache_key]
            cache_hits += 1
        else:
            try:
                network_attempts += 1
                api_response = fetch_wikitext_with_retry(word, args)
            except urllib.error.HTTPError as exc:
                stop_word = word
                stopped_reason = f"http_{exc.code}_after_{processed_count}_words"
                if exc.code != 429:
                    # Save what was gathered so far before propagating;
                    # previously everything since the last periodic save was lost.
                    persist_progress()
                    raise
                print(f"[STOP] Wiktionary ha risposto 429 su '{word}'. Salvo il progresso e interrompo il batch.")
                persist_progress()
                return build_summary()
            except (urllib.error.URLError, TimeoutError):
                # Connection-level failure: same policy, save then propagate.
                stop_word = word
                stopped_reason = f"network_error_after_{processed_count}_words"
                persist_progress()
                raise
            cache[cache_key] = api_response
            network_calls += 1
            # Throttle only real network calls; cache hits cost nothing.
            if args.sleep > 0:
                time.sleep(args.sleep)

        updated["wiktionary"] = wiktionary_payload_for_entry(updated, api_response)
        updated["wiktionary_generated_at"] = datetime.now().astimezone().isoformat(timespec="seconds")
        enriched_entries.append(updated)
        processed_count += 1
        print(
            f"[{index}/{len(targets)}] {word}: "
            f"status={updated['wiktionary'].get('status')} "
            f"def={len(updated['wiktionary'].get('definitions', []))} "
            f"topics={len(updated['wiktionary'].get('topic_hints', []))}"
        )
        if args.save_every > 0 and processed_count % int(args.save_every) == 0:
            persist_progress()
            print(f"[save] progresso salvato dopo {processed_count} parole")

    persist_progress()
    return build_summary()
|
||||
|
||||
|
||||
def main() -> None:
    """Run the Wiktionary enrichment from the command line and print a report."""
    summary = enrich_from_wiktionary(parse_args())

    print(f"Lessico con Wiktionary generato: {summary['output']}")
    total = summary["target_count"]
    print(f"Voci trattate: {summary.get('processed_count', total)}/{total}")
    if "skipped_existing_count" in summary:
        print(f"Voci già saltate: {summary['skipped_existing_count']}")
    print(f"Cache hit: {summary['cache_hits']}")
    print(f"Chiamate rete: {summary['network_calls']}")
    if "network_attempts" in summary:
        print(f"Tentativi di rete: {summary['network_attempts']}")

    # The two lines below only appear when the batch ended early.
    reason = summary.get("stopped_reason")
    if reason:
        print(f"Batch interrotto: {reason}")
    blocking_word = summary.get("stop_word")
    if blocking_word:
        print(f"Ultima parola bloccante: {blocking_word}")


if __name__ == "__main__":
    main()
|
||||
BIN
italian-words-dict-3.4.0.tgz
Normal file
BIN
italian-words-dict-3.4.0.tgz
Normal file
Binary file not shown.
BIN
iwn-omw-main.zip
Normal file
BIN
iwn-omw-main.zip
Normal file
Binary file not shown.
21
iwn-omw-main/IWN-OMW-main/README.md
Normal file
21
iwn-omw-main/IWN-OMW-main/README.md
Normal file
@@ -0,0 +1,21 @@
|
||||
# IWN-OMW
|
||||
This is the repository for the Open Italian WordNet, i.e. ItalWordNet versions compliant with the Open Multilingual WordNet guidelines and initiative.
|
||||
|
||||
IWN-OMW is a new LMF version of the ItalWordNet resource converted and formatted according to the guidelines and requirements defined by the Open Multilingual Wordnet initiative (OMW, https://omwn.org/). This current version is derived from the ItalWordNet v.2. (IWN) database (http://hdl.handle.net/20.500.11752/ILC-62).
|
||||
|
||||
NB: 'dc:relation', when used, contains links to equivalent Senses in the RDF version of the SIMPLE Italian lexicon.
|
||||
|
||||
## Licence
|
||||
|
||||
CC-BY-SA 4.0
|
||||
|
||||
## Citation
|
||||
|
||||
If you use this resource please cite:
|
||||
|
||||
Quochi, Valeria, Roberto Bartolini, and Monica Monachini (to appear) ‘ItalwordNet goes open’. *LiLT Special Issues on Open Multilingual
|
||||
WordNets*. CSLI Publications.
|
||||
|
||||
And
|
||||
|
||||
Roventini, Adriana, Antonietta Alonge, Francesca Bertagna, Nicoletta Calzolari, J. Cancila, C. Girardi, Bernardo Magnini, Rita Marinelli, Manuela Speranza, and Antonio Zampolli (2003) "ItalwordNet: building a large semantic database for the automatic treatment of Italian". *Linguistica Computazionale* 18-19:745-791.
|
||||
487155
iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml
Normal file
487155
iwn-omw-main/IWN-OMW-main/data/LMF-XML/IWN-OMW_LMF_v1.0.xml
Normal file
File diff suppressed because it is too large
Load Diff
432453
lexicon_it.json
Normal file
432453
lexicon_it.json
Normal file
File diff suppressed because it is too large
Load Diff
1301919
lexicon_it_semantic.json
Normal file
1301919
lexicon_it_semantic.json
Normal file
File diff suppressed because it is too large
Load Diff
203
package/LICENSE
Normal file
203
package/LICENSE
Normal file
@@ -0,0 +1,203 @@
|
||||
Copyright 2019 Ludan Stoecklé
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
17
package/README.md
Normal file
17
package/README.md
Normal file
@@ -0,0 +1,17 @@
|
||||
<!--
|
||||
Copyright 2019 Ludan Stoecklé
|
||||
SPDX-License-Identifier: CC-BY-4.0
|
||||
-->
|
||||
# Italian Word Dict
|
||||
|
||||
List of Italian words.
|
||||
|
||||
It is based on [morph-it](https://docs.sslmit.unibo.it/doku.php?id=resources:morph-it) which provides an extensive morphological resource for the Italian language.
|
||||
|
||||
You can use `italian-words` to use this resource properly.
|
||||
|
||||
|
||||
## dependencies and licences
|
||||
|
||||
[morph-it](https://docs.sslmit.unibo.it/doku.php?id=resources:morph-it) provides an extensive morphological resource for the Italian language. It is dual-licensed free software and can be redistributed and/or modified under the terms of the Creative Commons Attribution ShareAlike 2.0 License and the GNU Lesser General Public License.
|
||||
The derived file `words.json` remains under the same licence.
|
||||
85
package/dist/CC-BY-SA-2.0.txt
vendored
Normal file
85
package/dist/CC-BY-SA-2.0.txt
vendored
Normal file
@@ -0,0 +1,85 @@
|
||||
Creative Commons Attribution-ShareAlike 2.0
|
||||
|
||||
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE.
|
||||
|
||||
License
|
||||
|
||||
THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.
|
||||
|
||||
BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND CONDITIONS.
|
||||
|
||||
1. Definitions
|
||||
|
||||
a. "Collective Work" means a work, such as a periodical issue, anthology or encyclopedia, in which the Work in its entirety in unmodified form, along with a number of other contributions, constituting separate and independent works in themselves, are assembled into a collective whole. A work that constitutes a Collective Work will not be considered a Derivative Work (as defined below) for the purposes of this License.
|
||||
|
||||
b. "Derivative Work" means a work based upon the Work or upon the Work and other pre-existing works, such as a translation, musical arrangement, dramatization, fictionalization, motion picture version, sound recording, art reproduction, abridgment, condensation, or any other form in which the Work may be recast, transformed, or adapted, except that a work that constitutes a Collective Work will not be considered a Derivative Work for the purpose of this License. For the avoidance of doubt, where the Work is a musical composition or sound recording, the synchronization of the Work in timed-relation with a moving image ("synching") will be considered a Derivative Work for the purpose of this License.
|
||||
|
||||
c. "Licensor" means the individual or entity that offers the Work under the terms of this License.
|
||||
|
||||
d. "Original Author" means the individual or entity who created the Work.
|
||||
|
||||
e. "Work" means the copyrightable work of authorship offered under the terms of this License.
|
||||
|
||||
f. "You" means an individual or entity exercising rights under this License who has not previously violated the terms of this License with respect to the Work, or who has received express permission from the Licensor to exercise rights under this License despite a previous violation.
|
||||
|
||||
g. "License Elements" means the following high-level license attributes as selected by Licensor and indicated in the title of this License: Attribution, ShareAlike.
|
||||
|
||||
2. Fair Use Rights. Nothing in this license is intended to reduce, limit, or restrict any rights arising from fair use, first sale or other limitations on the exclusive rights of the copyright owner under copyright law or other applicable laws.
|
||||
|
||||
3. License Grant. Subject to the terms and conditions of this License, Licensor hereby grants You a worldwide, royalty-free, non-exclusive, perpetual (for the duration of the applicable copyright) license to exercise the rights in the Work as stated below:
|
||||
|
||||
a. to reproduce the Work, to incorporate the Work into one or more Collective Works, and to reproduce the Work as incorporated in the Collective Works;
|
||||
|
||||
b. to create and reproduce Derivative Works;
|
||||
|
||||
c. to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission the Work including as incorporated in Collective Works;
|
||||
|
||||
d. to distribute copies or phonorecords of, display publicly, perform publicly, and perform publicly by means of a digital audio transmission Derivative Works.
|
||||
|
||||
e. For the avoidance of doubt, where the work is a musical composition:
|
||||
|
||||
i. Performance Royalties Under Blanket Licenses. Licensor waives the exclusive right to collect, whether individually or via a performance rights society (e.g. ASCAP, BMI, SESAC), royalties for the public performance or public digital performance (e.g. webcast) of the Work.
|
||||
|
||||
ii. Mechanical Rights and Statutory Royalties. Licensor waives the exclusive right to collect, whether individually or via a music rights society or designated agent (e.g. Harry Fox Agency), royalties for any phonorecord You create from the Work ("cover version") and distribute, subject to the compulsory license created by 17 USC Section 115 of the US Copyright Act (or the equivalent in other jurisdictions).
|
||||
|
||||
f. Webcasting Rights and Statutory Royalties. For the avoidance of doubt, where the Work is a sound recording, Licensor waives the exclusive right to collect, whether individually or via a performance-rights society (e.g. SoundExchange), royalties for the public digital performance (e.g. webcast) of the Work, subject to the compulsory license created by 17 USC Section 114 of the US Copyright Act (or the equivalent in other jurisdictions).
|
||||
|
||||
The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. All rights not expressly granted by Licensor are hereby reserved.
|
||||
|
||||
4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions:
|
||||
|
||||
a. You may distribute, publicly display, publicly perform, or publicly digitally perform the Work only under the terms of this License, and You must include a copy of, or the Uniform Resource Identifier for, this License with every copy or phonorecord of the Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Work that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder. You may not sublicense the Work. You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Work itself to be made subject to the terms of this License. If You create a Collective Work, upon notice from any Licensor You must, to the extent practicable, remove from the Collective Work any reference to such Licensor or the Original Author, as requested. If You create a Derivative Work, upon notice from any Licensor You must, to the extent practicable, remove from the Derivative Work any reference to such Licensor or the Original Author, as requested.
|
||||
|
||||
b. You may distribute, publicly display, publicly perform, or publicly digitally perform a Derivative Work only under the terms of this License, a later version of this License with the same License Elements as this License, or a Creative Commons iCommons license that contains the same License Elements as this License (e.g. Attribution-ShareAlike 2.0 Japan). You must include a copy of, or the Uniform Resource Identifier for, this License or other license specified in the previous sentence with every copy or phonorecord of each Derivative Work You distribute, publicly display, publicly perform, or publicly digitally perform. You may not offer or impose any terms on the Derivative Works that alter or restrict the terms of this License or the recipients' exercise of the rights granted hereunder, and You must keep intact all notices that refer to this License and to the disclaimer of warranties. You may not distribute, publicly display, publicly perform, or publicly digitally perform the Derivative Work with any technological measures that control access or use of the Work in a manner inconsistent with the terms of this License Agreement. The above applies to the Derivative Work as incorporated in a Collective Work, but this does not require the Collective Work apart from the Derivative Work itself to be made subject to the terms of this License.
|
||||
|
||||
c. If you distribute, publicly display, publicly perform, or publicly digitally perform the Work or any Derivative Works or Collective Works, You must keep intact all copyright notices for the Work and give the Original Author credit reasonable to the medium or means You are utilizing by conveying the name (or pseudonym if applicable) of the Original Author if supplied; the title of the Work if supplied; to the extent reasonably practicable, the Uniform Resource Identifier, if any, that Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and in the case of a Derivative Work, a credit identifying the use of the Work in the Derivative Work (e.g., "French translation of the Work by Original Author," or "Screenplay based on original Work by Original Author"). Such credit may be implemented in any reasonable manner; provided, however, that in the case of a Derivative Work or Collective Work, at a minimum such credit will appear where any other comparable authorship credit appears and in a manner at least as prominent as such other comparable authorship credit.
|
||||
|
||||
5. Representations, Warranties and Disclaimer
|
||||
|
||||
UNLESS OTHERWISE AGREED TO BY THE PARTIES IN WRITING, LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE MATERIALS, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.
|
||||
|
||||
6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
|
||||
|
||||
7. Termination
|
||||
|
||||
a. This License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Derivative Works or Collective Works from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License.
|
||||
|
||||
b. Subject to the above terms and conditions, the license granted here is perpetual (for the duration of the applicable copyright in the Work). Notwithstanding the above, Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated above.
|
||||
|
||||
8. Miscellaneous
|
||||
|
||||
a. Each time You distribute or publicly digitally perform the Work or a Collective Work, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License.
|
||||
|
||||
b. Each time You distribute or publicly digitally perform a Derivative Work, Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License.
|
||||
|
||||
c. If any provision of this License is invalid or unenforceable under applicable law, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action by the parties to this agreement, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable.
|
||||
|
||||
d. No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the party to be charged with such waiver or consent.
|
||||
|
||||
e. This License constitutes the entire agreement between the parties with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You.
|
||||
|
||||
Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of Licensor.
|
||||
|
||||
Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, neither party will use the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time.
|
||||
|
||||
Creative Commons may be contacted at http://creativecommons.org/.
|
||||
14
package/dist/index.d.ts
vendored
Normal file
14
package/dist/index.d.ts
vendored
Normal file
@@ -0,0 +1,14 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2021 Ludan Stoecklé
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
export type Genders = 'M' | 'F';
|
||||
export interface WordInfo {
|
||||
G: Genders | null;
|
||||
S?: string | null;
|
||||
P: string | null;
|
||||
}
|
||||
export interface WordsInfo {
|
||||
[key: string]: WordInfo;
|
||||
}
|
||||
8
package/dist/index.js
vendored
Normal file
8
package/dist/index.js
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
"use strict";
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2021 Ludan Stoecklé
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
Object.defineProperty(exports, "__esModule", { value: true });
|
||||
//# sourceMappingURL=index.js.map
|
||||
1
package/dist/index.js.map
vendored
Normal file
1
package/dist/index.js.map
vendored
Normal file
@@ -0,0 +1 @@
|
||||
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";AAAA;;;;GAIG"}
|
||||
415
package/dist/readme-morph-it.txt
vendored
Normal file
415
package/dist/readme-morph-it.txt
vendored
Normal file
@@ -0,0 +1,415 @@
|
||||
===================================================================
|
||||
Morph-it!
|
||||
|
||||
A free morphological lexicon for the Italian Language
|
||||
===================================================================
|
||||
|
||||
version 0.4.8
|
||||
February 23 2009
|
||||
|
||||
*******************************************************************
|
||||
THIS README IS NOT REALLY UP TO DATE
|
||||
A NEW VERSION OF THIS
|
||||
README FILE WILL BE
|
||||
RELEASED (HOPEFULLY) SOON
|
||||
(BUT I WOULDN'T COUNT ON THAT...)
|
||||
*******************************************************************
|
||||
|
||||
Copyright (c) 2004-2009
|
||||
Marco Baroni (marco.baroni@unitn.it)
|
||||
Eros Zanchetta (eros@sslmit.unibo.it)
|
||||
|
||||
http://sslmit.unibo.it/morphit
|
||||
|
||||
|
||||
Morph-it! is a free (as in free speech and in free beer) morphological
|
||||
resource for the Italian language.
|
||||
|
||||
Morph-it! is a lexicon of inflected forms with their lemma and
|
||||
morphological features. For example:
|
||||
|
||||
gattini gattino NOUN-M:p
|
||||
andarono andare VER:ind+past+3+p
|
||||
fastidiosetto fastidioso ADJ:dim+m+s
|
||||
|
||||
As of version 0.4.7 the list contains 504,906 entries and 34,968
|
||||
lemmas.
|
||||
|
||||
Morph-it! can be used as a data source for an Italian lemmatizer /
|
||||
morphological analyzer / morphological generator.
|
||||
|
||||
As example applications, on the Morph-it! site you can download the
|
||||
lexicon compiled for the SFST [1] and Finite State Utilities [2]
|
||||
packages.
|
||||
|
||||
The data for Morph-it! were prepared by Marco Baroni and Eros
|
||||
Zanchetta using a mixture of corpus-based methods,
|
||||
regular-expression-based rules and manual checking. We are currently
|
||||
writing a paper that describes the procedure we used to build the
|
||||
resource.
|
||||
|
||||
Morph-it! is still under development and there may still be gaps,
|
||||
unlikely forms, etc. We will be very grateful if you let us know
|
||||
about missing forms, problems, and ideas/resources that can help
|
||||
us expand or clean the list (sslmitdevonline@sslmit.unibo.it).
|
||||
|
||||
Notice in particular that, since we extracted data from an Italian
|
||||
newspaper corpus (the la Repubblica corpus, also accessible from our
|
||||
site), we have many gaps in basic, every-day vocabulary.
|
||||
|
||||
Also, the current version does not distinguish between coordinative
|
||||
and subordinative conjunctions. We plan to do this in the near
|
||||
future. More in general, we are not fully satisfied with our current
|
||||
features for function words, and we plan to revise them.
|
||||
|
||||
A more ambitious plan we would like to pursue is the identification
|
||||
of derivational structure and derivationally related lemmas. Then, we
|
||||
will add full semantic representations. Then, we will take over the
|
||||
world and reign supreme for the next 100 years.
|
||||
|
||||
The remainder of this document contains a commented list of the
|
||||
morphological features used in the lexicon, licensing information and
|
||||
acknowledgments.
|
||||
|
||||
|
||||
FEATURES
|
||||
========
|
||||
|
||||
We distinguish between derivational features, that pertain to the
|
||||
lemma, and inflectional features, that pertain to the wordform.
|
||||
|
||||
Derivational and inflectional features are separated by a colon.
|
||||
|
||||
The derivational features are in upper case and they are
|
||||
dash-delimited. The inflectional features are in lower case and they
|
||||
are plus-sign-delimited.
|
||||
|
||||
For example, we represent gender as a derivational feature of nouns
|
||||
(we take "cameriere" and "cameriera" to belong to different lemmas),
|
||||
whereas we treat number as an inflectional feature of nouns. Thus,
|
||||
gender and number are represented as in the following examples:
|
||||
|
||||
cameriere cameriera NOUN-F:p
|
||||
cameriera cameriera NOUN-F:s
|
||||
camerieri cameriere NOUN-M:p
|
||||
cameriere cameriere NOUN-M:s
|
||||
|
||||
For adjectives, gender is considered an inflectional feature. Thus,
|
||||
gender is represented differently in adjectives and nouns:
|
||||
|
||||
azzurre azzurra NOUN-F:p
|
||||
azzurra azzurra NOUN-F:s
|
||||
azzurri azzurro NOUN-M:p
|
||||
azzurro azzurro NOUN-M:s
|
||||
|
||||
azzurra azzurro ADJ:pos+f+s
|
||||
azzurri azzurro ADJ:pos+m+p
|
||||
azzurro azzurro ADJ:pos+m+s
|
||||
azzurre azzurro ADJ:pos+f+p
|
||||
|
||||
Changes that are purely orthographical/phonological but do not affect
|
||||
morphology/syntax/meaning are not reflected in the features. For
|
||||
example, the following variants of "cento" share the same lemma and
|
||||
the same features:
|
||||
|
||||
cent' cento DET-NUM-CARD
|
||||
cento cento DET-NUM-CARD
|
||||
|
||||
We now present the full list of features we used, organized by major
|
||||
syntactic categories.
|
||||
|
||||
ABL
|
||||
|
||||
Abbreviated locutions, such as "a.C.", "ecc." and "i.e."
|
||||
|
||||
ADJ
|
||||
|
||||
Adjectives, with the following inflectional features:
|
||||
|
||||
pos/comp/sup
|
||||
|
||||
That is: positive, comparative, superlative. Although these are not
|
||||
true inflectional features, given their high productivity we decided
|
||||
to represent them as properties of inflected forms.
|
||||
|
||||
f/m
|
||||
|
||||
That is: feminine, masculine.
|
||||
|
||||
s/p
|
||||
|
||||
That is: singular, plural.
|
||||
|
||||
ADV
|
||||
|
||||
Adverbs.
|
||||
|
||||
ART
|
||||
|
||||
Articles, with gender as a derivational feature (F/M) and number as an
|
||||
inflectional feature (s/p).
|
||||
|
||||
ARTPRE
|
||||
|
||||
Preposition+article compounds ("col", "della", "nei"...), with gender
|
||||
as a derivational feature (F/M) and number as an inflectional feature
|
||||
(s/p).
|
||||
|
||||
ASP
|
||||
|
||||
Aspectuals ("stare" in "stare per"). Same inflectional features as VER
|
||||
(see below).
|
||||
|
||||
AUX
|
||||
|
||||
Auxiliaries ("essere", "avere", "venire"). Same inflectional features
|
||||
as VER (see below).
|
||||
|
||||
CAU
|
||||
|
||||
Causatives ("fare" in "far sapere"). Same inflectional features as VER
|
||||
(see below).
|
||||
|
||||
CE
|
||||
|
||||
Clitic "ce" as in "ce l'ho fatta".
|
||||
|
||||
CI
|
||||
|
||||
Clitic "ci" as in "ci prova".
|
||||
|
||||
CON
|
||||
|
||||
Conjunctions.
|
||||
|
||||
DET-DEMO
|
||||
|
||||
Demonstrative determiners (such as "questa" in "questa sera"), with
|
||||
inflectional gender (f/m) and number (s/p) features.
|
||||
|
||||
DET-INDEF
|
||||
|
||||
Indefinite determiners (such as "molti" in "molti amici") with
|
||||
inflectional gender (f/m) and number (s/p) features.
|
||||
|
||||
DET-NUM-CARD
|
||||
|
||||
Cardinal number determiners (e.g., "cinque" in "cinque
|
||||
amici"). Pure-digit numbers are not included (i.e., the list includes
|
||||
"100mila" but not "100000" nor "100,000", "100.000", etc.)
|
||||
|
||||
DET-POSS
|
||||
|
||||
Possessive determiners (e.g., "mio", "suo"), with inflectional gender
|
||||
(f/m) and number (s/p) features.
|
||||
|
||||
DET-WH
|
||||
|
||||
Wh determiners (e.g., quale in "quale amico"), with inflectional
|
||||
gender (f/m) and number (s/p) features.
|
||||
|
||||
INT
|
||||
|
||||
Interjections.
|
||||
|
||||
MOD
|
||||
|
||||
Modal verbs (e.g. "dover" in "dover ricostruire"). Same inflectional
|
||||
features as VER (see below).
|
||||
|
||||
NE
|
||||
|
||||
Clitic "ne" (as in: "ne hanno molte").
|
||||
|
||||
NOUN
|
||||
|
||||
Nouns, with gender as a derivational feature (F/M) and number as an
|
||||
inflectional feature (s/p).
|
||||
|
||||
PON
|
||||
|
||||
Non-sentential punctuation marks (e.g. , " $).
|
||||
|
||||
PRE
|
||||
|
||||
Prepositions.
|
||||
|
||||
PRO-DEMO
|
||||
|
||||
Demonstrative pronouns (e.g. "questa" in "voglio questa"), with both
|
||||
gender and number as derivational features (F/M, S/P).
|
||||
|
||||
PRO-INDEF
|
||||
|
||||
Indefinite pronouns (e.g., "molti" in "vengono molti"), with both
|
||||
gender and number as derivational features (F/M, S/P).
|
||||
|
||||
PRO-NUM
|
||||
|
||||
Numeral pronouns (e.g., "cinque" in "cinque sono
|
||||
sopravvissuti"). Pure-digit numbers are not included (e.g., the list
|
||||
includes "100mila" but not 100000 nor 100,000, 100.000, etc.)
|
||||
|
||||
PRO-PERS
|
||||
|
||||
Personal pronouns, such as "lui" and "loro". Clitic possessive
|
||||
pronouns (such as pronominal "lo" and "si") are marked by the
|
||||
derivational feature CLI. Person, gender and number are also encoded
|
||||
as derivational features (1/2/3, F/M, S/P).
|
||||
|
||||
PRO-POSS
|
||||
|
||||
Possessive pronouns (such as "loro" in "non era uno dei loro"), with
|
||||
gender and number encoded as derivational features (F/M, S/P).
|
||||
|
||||
PRO-WH
|
||||
|
||||
Wh-pronouns, such as "quale" in "quale e' venuto?"
|
||||
|
||||
SENT
|
||||
|
||||
End of sentence marker (! . ... : ?).
|
||||
|
||||
SI
|
||||
|
||||
Clitic "si" as in "di cui si discute".
|
||||
|
||||
TALE
|
||||
|
||||
"Tale" in constructions such as "una fortuna tale che...", "la tal
|
||||
cosa", "tali amici", ecc. Gender (f/m) and number (s/p) as
|
||||
inflectional features.
|
||||
|
||||
VER
|
||||
|
||||
Verbs, with the following inflectional features:
|
||||
|
||||
cond/ger/impr/ind/inf/part/sub
|
||||
|
||||
Conditional, gerundive, imperative, indicative, infinitive,
|
||||
participle, subjunctive.
|
||||
|
||||
pre/past/impf/fut
|
||||
|
||||
Present, past, imperfective, future.
|
||||
|
||||
1/2/3
|
||||
|
||||
Person.
|
||||
|
||||
s/p
|
||||
|
||||
Number.
|
||||
|
||||
f/m
|
||||
|
||||
Gender (only relevant for participles).
|
||||
|
||||
cela/cele/celi/celo/cene/ci/gli/gliela/gliele/glieli/glielo/gliene/la/
|
||||
le/li/lo/mela/mele/meli/melo/mene/mi/ne/sela/sele/seli/selo/sene/si/
|
||||
tela/tele/teli/telo/tene/ti/vela/vele/veli/velo/vene/vi
|
||||
|
||||
Clitics attached to the verb.
|
||||
|
||||
WH
|
||||
|
||||
Wh elements ("come", "qualora", "quando"...)
|
||||
|
||||
WH-CHE
|
||||
|
||||
"Che" as a wh element (e.g., "l'uomo che hai visto", "hai detto che").
|
||||
|
||||
|
||||
LICENSING INFORMATION
|
||||
======================
|
||||
|
||||
This program is dual-licensed free software; you can redistribute it
|
||||
and/or modify it under the terms of the Creative Commons
|
||||
Attribution ShareAlike 2.0 License and the GNU Lesser General Public
|
||||
License.
|
||||
|
||||
***********************************************
|
||||
* Creative Commons Attribution ShareAlike 2.0 *
|
||||
***********************************************
|
||||
|
||||
Morph-it! is licensed under the Creative Commons Attribution
|
||||
ShareAlike 2.0 License.
|
||||
|
||||
You are free:
|
||||
|
||||
- to copy, distribute and display the resource;
|
||||
- to make derivative works;
|
||||
- to make commercial use of the resource;
|
||||
|
||||
under the following conditions:
|
||||
|
||||
- you must give the original authors credit;
|
||||
- if you alter, transform, or build upon this work, you may distribute
|
||||
the resulting work only under a license identical to this one;
|
||||
- for any reuse or distribution, you must make clear to others the
|
||||
license terms of this work;
|
||||
- any of these conditions can be waived if you get permission from the
|
||||
copyright holders.
|
||||
|
||||
Your fair use and other rights are in no way affected by the above.
|
||||
|
||||
You can find a link to the full license from the Morph-it! website.
|
||||
|
||||
Copyright (C) 2004-2007 Marco Baroni and Eros Zanchetta.
|
||||
|
||||
*************************************
|
||||
* GNU Lesser General Public License *
|
||||
*************************************
|
||||
|
||||
Morph-it! A free morphological lexicon for the Italian Language
|
||||
Copyright (C) 2004-2007 Marco Baroni and Eros Zanchetta
|
||||
|
||||
This program is free software; you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation; either version 2 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License along
|
||||
with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
ACKNOWLEDGMENTS
|
||||
===============
|
||||
|
||||
The main data source for the Morph-it! lexicon was the "la Repubblica"
|
||||
corpus. Thus, we would like to thank the colleagues who developed this
|
||||
resource with us: Lorenzo Piccioni, Guy Aston, Silvia Bernardini,
|
||||
Federica Comastri, Alessandra Volpi, Marco Mazzoleni.
|
||||
|
||||
We would like to thank the developers of the tools we used to tag,
|
||||
lemmatize and index the Repubblica corpus: the (Italian) TreeTagger
|
||||
(Helmut Schmid, Achim Stein), the ACOPOST taggers (Ingo Schroeder) and
|
||||
the IMS Corpus WorkBench (Oli Christ, Arne Fitschen and Stefan Evert).
|
||||
|
||||
Thanks to Helmut Schmid also for converting the Morph-it! lexicon into
|
||||
a SFST transducer.
|
||||
|
||||
We would like to thank Aldo Calpini, who developed the perl module
|
||||
Lingua:IT:Conjugate.
|
||||
|
||||
We are also very grateful to Jan Daciuk for creating his finite-state
|
||||
utilities and for helping us learn to use them.
|
||||
|
||||
Finally, a big thanks to the members of the FoLUG, SannioLUG and
|
||||
Scuola (software libero nella scuola) mailing lists, for advice about
|
||||
licensing and dissemination.
|
||||
|
||||
...and kudos to Lorenzo for creating and maintaining the SSLMITDev
|
||||
site!
|
||||
|
||||
|
||||
FOOTNOTES
|
||||
=========
|
||||
|
||||
[1] http://www.ims.uni-stuttgart.de/projekte/gramotron/SOFTWARE/SFST.html
|
||||
[2] http://juggernaut.eti.pg.gda.pl/~jandac/fsa.html
|
||||
1
package/dist/words.json
vendored
Normal file
1
package/dist/words.json
vendored
Normal file
File diff suppressed because one or more lines are too long
21
package/gulpfile.js
Normal file
21
package/gulpfile.js
Normal file
@@ -0,0 +1,21 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2019 Ludan Stoecklé
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const { processItalianWords } = require('./dist/create/createList');
|
||||
const { series } = require('gulp');
|
||||
|
||||
function createWords(cb) {
|
||||
processItalianWords('resources/morph-it_048.txt', 'dist/words.json', cb);
|
||||
}
|
||||
|
||||
function copyLicences(cb) {
|
||||
fs.copyFileSync('./resources/CC-BY-SA-2.0.txt', './dist/CC-BY-SA-2.0.txt');
|
||||
fs.copyFileSync('./resources/readme-morph-it.txt', './dist/readme-morph-it.txt');
|
||||
cb();
|
||||
}
|
||||
|
||||
exports.build = series(createWords, copyLicences);
|
||||
23
package/package.json
Normal file
23
package/package.json
Normal file
@@ -0,0 +1,23 @@
|
||||
{
|
||||
"name": "italian-words-dict",
|
||||
"version": "3.4.0",
|
||||
"description": "Italian words dictionary, based on the morph-it linguistic resource",
|
||||
"main": "dist/index.js",
|
||||
"scripts": {
|
||||
"clean": "rm -rf dist",
|
||||
"test": "nyc --reporter=lcov --reporter=text mocha",
|
||||
"build": "tsc && gulp build"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/RosaeNLG/rosaenlg.git"
|
||||
},
|
||||
"keywords": [
|
||||
"words",
|
||||
"Italian",
|
||||
"morph-it"
|
||||
],
|
||||
"author": "Ludan Stoecklé <ludan.stoeckle@rosaenlg.org>",
|
||||
"license": "Apache-2.0",
|
||||
"gitHead": "745dc50c54690936fba332ca465308c607053e46"
|
||||
}
|
||||
21
package/test/test.js
Normal file
21
package/test/test.js
Normal file
@@ -0,0 +1,21 @@
|
||||
/**
|
||||
* @license
|
||||
* Copyright 2019 Ludan Stoecklé
|
||||
* SPDX-License-Identifier: Apache-2.0
|
||||
*/
|
||||
|
||||
const assert = require('assert');
|
||||
const italianWords = require('../dist/words.json');
|
||||
|
||||
// Smoke tests for the generated dist/words.json dictionary.
describe('italian-words-dict', () => {
  it('should contain something', () => {
    assert.ok(italianWords != null);
    const entryCount = Object.keys(italianWords).length;
    assert.ok(entryCount > 100);
  });

  it('pizza should be ok', () => {
    const { pizza } = italianWords;
    assert.ok(pizza != null);
    // NOTE(review): 'G' and 'P' look like gender and plural keys - confirm against createList.
    assert.strictEqual(pizza.G, 'F');
    assert.strictEqual(pizza.P, 'pizze');
  });
});
|
||||
473
refine_lexicon_topics.py
Normal file
473
refine_lexicon_topics.py
Normal file
@@ -0,0 +1,473 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Tuple
|
||||
|
||||
from build_enriched_lexicon import ENRICHED_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default destination of the refined lexicon: "lexicon_it_refined.json" placed
# in the same directory as this script (used as the --output default).
REFINED_LEXICON_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_refined.json")
|
||||
|
||||
# Topic label -> lowercase Italian keywords. infer_topic_scores looks these up
# in the flattened text of an entry and adds points to the topic for each hit.
# Several entries are deliberately truncated stems ("ecclesiast", "sacerdot",
# "trasport", "medic", ...), presumably so that inflected forms match too.
# The same keyword may appear under more than one topic.
TOPIC_KEYWORDS: Dict[str, Tuple[str, ...]] = {
    "religion": (
        "abbazia",
        "abate",
        "arcivescovo",
        "cappella",
        "cardinale",
        "chiesa",
        "clero",
        "convento",
        "diocesi",
        "ecclesiast",
        "fede",
        "frate",
        "mistica",
        "monaco",
        "monastero",
        "parrocchia",
        "prete",
        "religion",
        "sacerdot",
        "santo",
        "vescovo",
    ),
    "ecclesiastical_hierarchy": (
        "abate",
        "arcivescovo",
        "carica ecclesiastica",
        "cardinale",
        "clero",
        "dignità ecclesiastica",
        "ecclesiast",
        "ordinazione",
        "parroco",
        "patriarca",
        "pontefice",
        "prete",
        "priore",
        "superiore del monastero",
        "vescovo",
    ),
    "honorific_title": (
        "carica",
        "epiteto",
        "nobile",
        "onore",
        "onorific",
        "titolo",
    ),
    "mysticism": (
        "asceta",
        "contemplazione",
        "estasi",
        "mistica",
        "mistico",
        "monachesimo",
        "spiritual",
    ),
    "geography": (
        "borgo",
        "città",
        "comune",
        "frazione",
        "geografia",
        "isola",
        "località",
        "paese",
        "provincia",
        "regione",
        "stato",
        "toponimo",
        "valle",
    ),
    "transport": (
        "aereo",
        "aeroplano",
        "auto",
        "autobus",
        "autocarro",
        "barca",
        "bicicletta",
        "imbarcazione",
        "locomotiva",
        "motore",
        "nave",
        "pista",
        "porto",
        "stazione",
        "traghetto",
        "treno",
        "trasport",
        "veicolo",
        "viaggio",
    ),
    "nature": (
        "acqua",
        "albero",
        "animale",
        "bosco",
        "fiore",
        "fiume",
        "foresta",
        "mare",
        "montagna",
        "natura",
        "pianta",
        "terra",
    ),
    "health": (
        "ambulanza",
        "anemia",
        "cura",
        "farmaco",
        "malattia",
        "medic",
        "ospedale",
        "paziente",
        "salute",
        "soccorso",
        "terapia",
    ),
    "war": (
        "arma",
        "artiglieria",
        "assalto",
        "battaglia",
        "bombard",
        "esercito",
        "fortezza",
        "guerra",
        "militare",
        "soldato",
        "trincea",
    ),
}
|
||||
|
||||
# Function words (Italian articles/prepositions plus English "and"/"the").
# cleanup_tag discards any tag whose slug is exactly one of these.
TAG_STOPWORDS = {
    "and",
    "con",
    "da",
    "dei",
    "del",
    "della",
    "delle",
    "dello",
    "di",
    "e",
    "il",
    "in",
    "la",
    "le",
    "lo",
    "nel",
    "nella",
    "per",
    "su",
    "the",
    "un",
    "una",
}
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the topic-refinement script.

    Returns:
        Namespace with ``input`` and ``output`` (Path), ``replace_general``
        (bool, False unless the flag is passed) and ``min_topic_score``
        (int, default 40).
    """
    cli = argparse.ArgumentParser(
        description="Genera un lessico raffinato con campi aggiuntivi per topic, tag semantici e sensi."
    )

    # The two path options differ only in flag, default and help text.
    path_options = (
        (
            "--input",
            ENRICHED_LEXICON_OUTPUT_PATH,
            "File lessicale di partenza, tipicamente lexicon_it_enriched.json.",
        ),
        (
            "--output",
            REFINED_LEXICON_OUTPUT_PATH,
            "Nuovo file lessicale raffinato da generare.",
        ),
    )
    for flag, default_path, help_text in path_options:
        cli.add_argument(flag, type=Path, default=default_path, help=help_text)

    cli.add_argument(
        "--replace-general",
        action="store_true",
        help="Se attivo, sostituisce topic=['general'] con i topic suggeriti quando la confidenza e alta.",
    )
    cli.add_argument(
        "--min-topic-score",
        type=int,
        default=40,
        help="Punteggio minimo per promuovere un topic suggerito nei topics finali.",
    )
    return cli.parse_args()
|
||||
|
||||
|
||||
def load_json(path: Path) -> Dict[str, object]:
    """Read *path* as UTF-8 text and return the decoded JSON document."""
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def write_json(path: Path, payload: Dict[str, object]) -> None:
    """Write *payload* to *path* as UTF-8 JSON, indented by 2, keeping non-ASCII characters."""
    # Serialize before opening so a serialization error cannot truncate the file.
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(serialized)
|
||||
|
||||
|
||||
def dedupe(items: Iterable[str]) -> List[str]:
    """Return the stripped, non-empty items in first-seen order, without duplicates.

    Items are converted with ``str()`` and stripped; two items are duplicates
    when they are equal ignoring case, and the first spelling seen is kept.
    """
    seen_keys = set()
    unique: List[str] = []
    for raw in items:
        candidate = str(raw).strip()
        folded = candidate.lower()
        if candidate and folded not in seen_keys:
            seen_keys.add(folded)
            unique.append(candidate)
    return unique
|
||||
|
||||
|
||||
def slugify_tag(text: str) -> str:
    """Turn *text* into a lowercase ASCII slug made of ``a-z``, ``0-9`` and ``_``.

    Accented letters are reduced to their base letter before slugging
    ("città" -> "citta"); previously they were replaced by the separator,
    which truncated Italian words ("città" -> "citt"). Every remaining run
    of other characters becomes a single underscore, and leading/trailing
    underscores are removed.

    Returns:
        The slug, or "" when *text* contains no letter or digit.
    """
    import unicodedata  # local import: only this function needs it

    lowered = text.strip().lower()
    # NFKD splits "à" into "a" + combining grave accent; dropping the
    # combining marks leaves the plain base letters.
    decomposed = unicodedata.normalize("NFKD", lowered)
    base_letters = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # The text is already lowercase, so no case-insensitive flag is needed.
    return re.sub(r"[^a-z0-9]+", "_", base_letters).strip("_")
|
||||
|
||||
|
||||
def cleanup_tag(tag: str) -> str:
    """Return the slug of *tag*, or "" when the slug is not worth keeping.

    A slug is rejected when it is empty, a single character, or one of
    TAG_STOPWORDS.
    """
    slug = slugify_tag(tag)
    is_meaningful = len(slug) > 1 and slug not in TAG_STOPWORDS
    return slug if is_meaningful else ""
|
||||
|
||||
|
||||
def flatten_text(entry: Dict[str, object]) -> str:
    """Join every textual field of *entry* into one lowercase, space-separated string.

    Fields are read in this order: ``topics``; from ``semantic`` the
    ``semantic_topics``, ``glosses`` and each synset's ``definition`` and
    ``lemmas``; from ``babelnet`` the ``synset_refs``, then the ``glosses``,
    ``categories``, ``domains`` and ``senses`` of ``best_synset`` followed by
    those of every item in ``synsets``. Sections or synsets that are not
    dicts are skipped; missing or null lists contribute nothing.
    """

    def texts(container: Dict[str, object], field: str) -> List[str]:
        # Stringified items of a list field; absent/null fields yield [].
        return [str(value) for value in container.get(field, []) or []]

    parts: List[str] = texts(entry, "topics")

    semantic_block = entry.get("semantic", {})
    if isinstance(semantic_block, dict):
        parts += texts(semantic_block, "semantic_topics")
        parts += texts(semantic_block, "glosses")
        for sem_synset in semantic_block.get("synsets", []) or []:
            if not isinstance(sem_synset, dict):
                continue
            parts.append(str(sem_synset.get("definition", "")))
            parts += texts(sem_synset, "lemmas")

    babelnet_block = entry.get("babelnet", {})
    if isinstance(babelnet_block, dict):
        parts += texts(babelnet_block, "synset_refs")
        # best_synset first, then the full synset list, all read the same way.
        primary = babelnet_block.get("best_synset", {})
        candidates = [primary] if isinstance(primary, dict) else []
        candidates.extend(
            item for item in (babelnet_block.get("synsets", []) or []) if isinstance(item, dict)
        )
        for synset in candidates:
            for field in ("glosses", "categories", "domains", "senses"):
                parts += texts(synset, field)

    return " ".join(parts).lower()
|
||||
|
||||
|
||||
def infer_topic_scores(entry: Dict[str, object]) -> Dict[str, int]:
    """Score each topic of TOPIC_KEYWORDS against the flattened text of *entry*.

    Every keyword hit is worth 12 points and a topic's total is capped at 100.
    Keywords are treated as word prefixes: a hit must begin at the start of a
    word. Stems such as "medic" therefore still match "medicina", while
    "arma" no longer fires inside "farmaco", nor "mare" inside "amare"
    (plain substring counting used to credit those to unrelated topics).

    Returns:
        Mapping topic -> score (12..100); topics without any hit are omitted.
    """
    text = flatten_text(entry)
    scores: Dict[str, int] = {}
    for topic, keywords in TOPIC_KEYWORDS.items():
        hits = 0
        for keyword in keywords:
            # The negative lookbehind anchors the keyword at a word start
            # without consuming text; escape() keeps the keyword literal.
            pattern = r"(?<!\w)" + re.escape(keyword.lower())
            hits += len(re.findall(pattern, text))
        if hits:
            scores[topic] = min(12 * hits, 100)
    return scores
|
||||
|
||||
|
||||
def collect_semantic_tags(entry: Dict[str, object]) -> List[str]:
    """Gather the candidate tags of *entry* and return them cleaned and de-duplicated.

    Sources, in order: ``topics``; ``semantic.semantic_topics`` and every
    term group of ``semantic.raw_relation_terms``; the ``categories`` and
    ``domains`` of ``babelnet.best_synset`` and then of each item in
    ``babelnet.synsets``. Each raw tag goes through cleanup_tag; rejected
    tags are dropped.
    """
    raw_tags: List[str] = [str(topic) for topic in entry.get("topics", []) or []]

    semantic_block = entry.get("semantic", {})
    if isinstance(semantic_block, dict):
        raw_tags += [str(topic) for topic in semantic_block.get("semantic_topics", []) or []]
        relation_terms = semantic_block.get("raw_relation_terms", {}) or {}
        for term_group in relation_terms.values():
            raw_tags += [str(term) for term in term_group or []]

    babelnet_block = entry.get("babelnet", {})
    if isinstance(babelnet_block, dict):
        primary = babelnet_block.get("best_synset", {})
        synset_pool = [primary] if isinstance(primary, dict) else []
        synset_pool += [
            item for item in (babelnet_block.get("synsets", []) or []) if isinstance(item, dict)
        ]
        for synset in synset_pool:
            for field in ("categories", "domains"):
                raw_tags += [str(value) for value in synset.get(field, []) or []]

    normalized = dedupe(cleanup_tag(tag) for tag in raw_tags)
    return [tag for tag in normalized if tag]
|
||||
|
||||
|
||||
def collect_senses(entry: Dict[str, object], topic_scores: Dict[str, int]) -> List[Dict[str, object]]:
    """Build the list of sense records for *entry*.

    One record is produced for every ``semantic.synsets`` item that has a
    non-empty definition (fixed confidence 0.7), plus at most one record for
    ``babelnet.best_synset`` when it has an ``id`` and at least one non-empty
    gloss. The BabelNet confidence is ``topic_score / 100`` clamped to
    0.4..0.95 and rounded to two decimals.

    Scalar fields that are JSON null are treated like missing ones, so a null
    ``definition``/``topic`` no longer becomes the text "None" and a null
    ``topic_score`` no longer raises in ``float()``.

    Args:
        entry: Lexicon entry to read.
        topic_scores: Topic -> score mapping; topics scoring at least 50 are
            attached to every sense.

    Returns:
        Sense dicts with keys ``source``, ``id``, ``definition``, ``lemmas``,
        ``topics`` and ``confidence``.
    """
    # Same for every sense, so compute it once instead of per record.
    strong_topics = [topic for topic, score in topic_scores.items() if score >= 50]
    senses: List[Dict[str, object]] = []

    semantic = entry.get("semantic", {})
    if isinstance(semantic, dict):
        semantic_topics = list(semantic.get("semantic_topics", []) or [])
        for synset in semantic.get("synsets", []) or []:
            if not isinstance(synset, dict):
                continue
            # `or ""` keeps a null definition from turning into "None".
            definition = str(synset.get("definition") or "").strip()
            if not definition:
                continue
            senses.append(
                {
                    "source": "semantic",
                    "id": synset.get("id"),
                    "definition": definition,
                    "lemmas": dedupe(str(item) for item in synset.get("lemmas", []) or []),
                    "topics": dedupe(semantic_topics + strong_topics),
                    "confidence": 0.7,
                }
            )

    babelnet = entry.get("babelnet", {})
    if isinstance(babelnet, dict):
        best_synset = babelnet.get("best_synset", {})
        if isinstance(best_synset, dict) and best_synset.get("id"):
            glosses = [
                str(item).strip()
                for item in best_synset.get("glosses", []) or []
                if str(item).strip()
            ]
            if glosses:
                primary_topic = str(best_synset.get("topic") or "").strip()
                raw_score = float(best_synset.get("topic_score") or 0)
                confidence = round(min(max(raw_score / 100.0, 0.4), 0.95), 2)
                senses.append(
                    {
                        "source": "babelnet",
                        "id": best_synset.get("id"),
                        # Only the first gloss is used as the definition.
                        "definition": glosses[0],
                        "lemmas": dedupe(str(item) for item in best_synset.get("senses", []) or []),
                        "topics": dedupe([primary_topic] + strong_topics),
                        "confidence": confidence,
                    }
                )

    return senses
|
||||
|
||||
|
||||
def collect_geo_tags(entry: Dict[str, object]) -> List[str]:
    """Return ``["toponym_possible"]`` when a BabelNet category hints at a place name.

    Every category of every dict item in ``babelnet.synsets`` is lowercased
    and checked for one of the place-related markers; the result is an empty
    list when none matches or when ``babelnet`` is not a dict.
    """
    place_markers = ("comuni_", "province_", "regioni_", "città", "paesi", "località")

    babelnet_block = entry.get("babelnet", {})
    if not isinstance(babelnet_block, dict):
        return []

    matching_categories = [
        category
        for synset in babelnet_block.get("synsets", []) or []
        if isinstance(synset, dict)
        for category in synset.get("categories", []) or []
        if any(marker in str(category).lower() for marker in place_markers)
    ]
    # However many categories match, the tag is reported once.
    return ["toponym_possible"] if matching_categories else []
|
||||
|
||||
|
||||
def collect_name_tags(entry: Dict[str, object]) -> List[str]:
    """Return ``["capitalized_form"]`` when the entry's ``form`` starts with an uppercase letter, else ``[]``."""
    surface_form = str(entry.get("form", ""))
    # Slicing (not indexing) keeps an empty form from raising.
    return ["capitalized_form"] if surface_form[:1].isupper() else []
|
||||
|
||||
|
||||
def should_review(entry: Dict[str, object], topic_scores: Dict[str, int], senses: List[Dict[str, object]]) -> bool:
    """Decide whether *entry* should be flagged for manual review.

    The entry is flagged when any of these holds:

    * its only topic is "general" (case-insensitive) and no topic scores >= 50;
    * its BabelNet status is "ambiguous" and the best topic score is below 50;
    * it has at least three senses and at least two topics scoring >= 50.

    A ``babelnet`` value that is not a dict is treated as having no status,
    matching the ``isinstance`` guards of the other collectors, instead of
    raising AttributeError.
    """
    existing_topics = [str(topic).lower() for topic in entry.get("topics", []) or []]
    best_score = max(topic_scores.values(), default=0)
    strong_topics = [topic for topic, score in topic_scores.items() if score >= 50]

    babelnet = entry.get("babelnet", {})
    babelnet_status = str(babelnet.get("status", "")) if isinstance(babelnet, dict) else ""

    if existing_topics == ["general"] and not strong_topics:
        return True
    if babelnet_status == "ambiguous" and best_score < 50:
        return True
    if len(senses) >= 3 and len(strong_topics) >= 2:
        return True
    return False
|
||||
|
||||
|
||||
def promoted_topics(
    existing_topics: List[str], topic_scores: Dict[str, int], replace_general: bool, min_topic_score: int
) -> List[str]:
    """Merge the existing topics with the inferred topics that reach *min_topic_score*.

    Inferred topics are ordered by descending score, ties broken by name.
    When *replace_general* is set and the de-duplicated existing topics are
    exactly ``["general"]``, the inferred topics (if any) replace them;
    otherwise they are appended after the existing ones without duplicates.
    """
    ranked = sorted(topic_scores.items(), key=lambda pair: (-pair[1], pair[0]))
    qualifying = [name for name, value in ranked if value >= min_topic_score]
    current = dedupe(existing_topics)

    only_general = current == ["general"]
    if replace_general and only_general and qualifying:
        return qualifying
    return dedupe(current + qualifying)
|
||||
|
||||
|
||||
def refine_entry(entry: Dict[str, object], replace_general: bool, min_topic_score: int) -> Dict[str, object]:
    """Return a deep copy of *entry* enriched with the refinement fields.

    The copy gets an updated ``topics`` list plus ``semantic_tags``,
    ``senses``, ``topic_confidence``, ``topic_suggestions`` (topics by
    descending score, then name), ``geo_tags``, ``name_tags`` and
    ``needs_review``. The input entry is left untouched.
    """
    refined = deepcopy(entry)

    # Collect everything from the still-unmodified copy before writing to it.
    original_topics = [str(topic) for topic in refined.get("topics", []) or []]
    topic_scores = infer_topic_scores(refined)
    semantic_tags = collect_semantic_tags(refined)
    senses = collect_senses(refined, topic_scores)
    geo_tags = collect_geo_tags(refined)
    name_tags = collect_name_tags(refined)
    ranked_topics = sorted(topic_scores, key=lambda name: (-topic_scores[name], name))

    refined["topics"] = promoted_topics(original_topics, topic_scores, replace_general, min_topic_score)
    refined.update(
        semantic_tags=semantic_tags,
        senses=senses,
        topic_confidence=topic_scores,
        topic_suggestions=ranked_topics,
        geo_tags=geo_tags,
        name_tags=name_tags,
    )
    # NOTE(review): should_review reads the already-promoted "topics", not the
    # original ones - confirm this is the intended basis for the review flag.
    refined["needs_review"] = should_review(refined, topic_scores, senses)
    return refined
|
||||
|
||||
|
||||
def build_refined_lexicon(args: argparse.Namespace) -> Dict[str, object]:
    """Load the input lexicon named by *args* and return the refined payload.

    Every dict item of the input ``entries`` is passed through refine_entry;
    non-dict items are skipped. The result has a ``meta`` section (counts,
    the options used, generation timestamp) and the refined ``entries``.

    Raises:
        ValueError: if the input JSON is not an object with an ``entries`` key.
    """
    source = load_json(args.input)
    if not (isinstance(source, dict) and "entries" in source):
        raise ValueError(f"Lessico di input non valido: {args.input}")

    refined_entries = []
    for raw_entry in source.get("entries", []) or []:
        if isinstance(raw_entry, dict):
            refined_entries.append(refine_entry(raw_entry, args.replace_general, args.min_topic_score))

    flagged_total = sum(1 for item in refined_entries if item.get("needs_review"))
    suggested_total = sum(1 for item in refined_entries if len(item.get("topic_suggestions", []) or []) > 0)

    meta = {
        "language": "it",
        "version": 1,
        "base_lexicon": args.input.name,
        # Local time with its UTC offset, to the second.
        "generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
        "entry_count": len(refined_entries),
        "replace_general": args.replace_general,
        "min_topic_score": args.min_topic_score,
        "review_count": flagged_total,
        "topicful_count": suggested_total,
    }
    return {"meta": meta, "entries": refined_entries}
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: build the refined lexicon, write it, print a short summary."""
    args = parse_args()
    payload = build_refined_lexicon(args)
    write_json(args.output, payload)

    meta = payload["meta"]
    print(f"Lessico raffinato generato: {args.output}")
    print(f"Voci totali: {meta['entry_count']}")
    print(f"Voci con suggerimenti di topic: {meta['topicful_count']}")
    print(f"Voci marcate needs_review: {meta['review_count']}")


if __name__ == "__main__":
    main()
|
||||
5
run_babelnet_daily_batch.bat
Normal file
5
run_babelnet_daily_batch.bat
Normal file
@@ -0,0 +1,5 @@
|
||||
@echo off
rem Launches babelnet_daily_batch.py from the directory that contains this script.
setlocal
rem %~dp0 is the drive and folder of this .bat file; /d also switches the current drive.
cd /d "%~dp0"
rem NOTE(review): the limit and sleep values are passed through as-is; their exact meaning is defined by babelnet_daily_batch.py.
python babelnet_daily_batch.py --api-call-limit 1900 --per-key-api-call-limit 950 --sleep 0.2
endlocal
|
||||
558
vocaboli_it.txt
Normal file
558
vocaboli_it.txt
Normal file
@@ -0,0 +1,558 @@
|
||||
adesso
|
||||
adige
|
||||
adone
|
||||
agave
|
||||
agile
|
||||
ago
|
||||
aiuto
|
||||
alba
|
||||
albero
|
||||
alga
|
||||
alito
|
||||
alloro
|
||||
aloe
|
||||
alpaca
|
||||
alta
|
||||
amaca
|
||||
amare
|
||||
ambra
|
||||
ameno
|
||||
amico
|
||||
amore
|
||||
anatra
|
||||
anello
|
||||
angelo
|
||||
anice
|
||||
anima
|
||||
anno
|
||||
antenna
|
||||
aprire
|
||||
aratro
|
||||
arco
|
||||
arena
|
||||
argento
|
||||
aria
|
||||
arpa
|
||||
arredo
|
||||
arrivo
|
||||
arte
|
||||
asilo
|
||||
asino
|
||||
aspro
|
||||
asta
|
||||
atomo
|
||||
attesa
|
||||
auguri
|
||||
aula
|
||||
aurora
|
||||
autore
|
||||
avena
|
||||
avere
|
||||
avviso
|
||||
azione
|
||||
azzurro
|
||||
bacio
|
||||
bagaglio
|
||||
balcone
|
||||
banco
|
||||
barca
|
||||
barone
|
||||
base
|
||||
basso
|
||||
bastone
|
||||
becco
|
||||
bello
|
||||
bene
|
||||
biondo
|
||||
biscia
|
||||
blocco
|
||||
borsa
|
||||
bosco
|
||||
breve
|
||||
brina
|
||||
bronzo
|
||||
bruco
|
||||
buio
|
||||
burro
|
||||
cacao
|
||||
cadere
|
||||
calamaio
|
||||
caldo
|
||||
calice
|
||||
camera
|
||||
camino
|
||||
campana
|
||||
canale
|
||||
candela
|
||||
capace
|
||||
capello
|
||||
capire
|
||||
capra
|
||||
carbone
|
||||
carta
|
||||
casale
|
||||
cassone
|
||||
castoro
|
||||
cavallo
|
||||
cedere
|
||||
celeste
|
||||
cena
|
||||
centro
|
||||
cerchio
|
||||
certezza
|
||||
cervo
|
||||
chiaro
|
||||
chitarra
|
||||
cielo
|
||||
cifra
|
||||
cigno
|
||||
cima
|
||||
cintura
|
||||
circolo
|
||||
cittadino
|
||||
classe
|
||||
clima
|
||||
collina
|
||||
colore
|
||||
cometa
|
||||
comune
|
||||
conca
|
||||
condurre
|
||||
confine
|
||||
coniglio
|
||||
conto
|
||||
corda
|
||||
corona
|
||||
cortile
|
||||
cosa
|
||||
costa
|
||||
creare
|
||||
crescere
|
||||
crinale
|
||||
croce
|
||||
cuore
|
||||
cura
|
||||
dado
|
||||
danza
|
||||
dare
|
||||
debole
|
||||
decoro
|
||||
denso
|
||||
dente
|
||||
deserto
|
||||
destino
|
||||
detto
|
||||
dialogo
|
||||
difesa
|
||||
digitale
|
||||
dipingere
|
||||
diritto
|
||||
divano
|
||||
docile
|
||||
dogana
|
||||
dolce
|
||||
domanda
|
||||
dono
|
||||
dormire
|
||||
dorso
|
||||
drago
|
||||
ebano
|
||||
eco
|
||||
edera
|
||||
educare
|
||||
effetto
|
||||
elica
|
||||
elogio
|
||||
elmo
|
||||
energia
|
||||
enorme
|
||||
entrare
|
||||
epoca
|
||||
equatore
|
||||
erba
|
||||
erede
|
||||
eroe
|
||||
errore
|
||||
esame
|
||||
esilio
|
||||
esistere
|
||||
esito
|
||||
eterno
|
||||
etica
|
||||
fare
|
||||
favola
|
||||
febbre
|
||||
felice
|
||||
fermare
|
||||
ferro
|
||||
festa
|
||||
fiaba
|
||||
fiducia
|
||||
figura
|
||||
filo
|
||||
finale
|
||||
fiume
|
||||
fiore
|
||||
firmare
|
||||
flauto
|
||||
foglia
|
||||
fonte
|
||||
forza
|
||||
fosso
|
||||
frase
|
||||
freccia
|
||||
freno
|
||||
frutto
|
||||
fuga
|
||||
fumo
|
||||
fuoco
|
||||
futuro
|
||||
gabbiano
|
||||
galassia
|
||||
gamba
|
||||
gatto
|
||||
gelato
|
||||
gemma
|
||||
geniale
|
||||
gesto
|
||||
giallo
|
||||
giardino
|
||||
girare
|
||||
giudice
|
||||
giorno
|
||||
giovane
|
||||
giubba
|
||||
giugno
|
||||
globo
|
||||
goccia
|
||||
gomito
|
||||
grado
|
||||
grammo
|
||||
grande
|
||||
grano
|
||||
gravare
|
||||
greto
|
||||
guadagno
|
||||
guanto
|
||||
guida
|
||||
guscio
|
||||
idea
|
||||
idolo
|
||||
illeso
|
||||
impero
|
||||
impronta
|
||||
incanto
|
||||
incontro
|
||||
indicare
|
||||
indole
|
||||
inerzia
|
||||
infinito
|
||||
inizio
|
||||
inno
|
||||
insalata
|
||||
insieme
|
||||
intesa
|
||||
invito
|
||||
isola
|
||||
istante
|
||||
labbro
|
||||
lago
|
||||
lana
|
||||
largo
|
||||
lastra
|
||||
latte
|
||||
laurea
|
||||
lavare
|
||||
legame
|
||||
legenda
|
||||
leggere
|
||||
legno
|
||||
lente
|
||||
lezione
|
||||
libellula
|
||||
limite
|
||||
linea
|
||||
liquido
|
||||
liscia
|
||||
litigare
|
||||
livello
|
||||
locale
|
||||
lodo
|
||||
lontano
|
||||
lotta
|
||||
lucente
|
||||
luce
|
||||
luna
|
||||
lupo
|
||||
macchia
|
||||
madre
|
||||
maestro
|
||||
magnete
|
||||
magro
|
||||
maiolica
|
||||
mandorla
|
||||
maniglia
|
||||
mano
|
||||
mare
|
||||
margine
|
||||
martello
|
||||
maschera
|
||||
massa
|
||||
materia
|
||||
medaglia
|
||||
melodia
|
||||
memoria
|
||||
menta
|
||||
merito
|
||||
metallo
|
||||
metodo
|
||||
mezzo
|
||||
miraggio
|
||||
misura
|
||||
modello
|
||||
moderno
|
||||
momento
|
||||
mondo
|
||||
montone
|
||||
morbido
|
||||
mordere
|
||||
mosaico
|
||||
motore
|
||||
muovere
|
||||
nascere
|
||||
nastro
|
||||
nave
|
||||
nebbia
|
||||
neutro
|
||||
nocciola
|
||||
nome
|
||||
notare
|
||||
notizia
|
||||
nuvola
|
||||
oblio
|
||||
odore
|
||||
offerta
|
||||
ombra
|
||||
onda
|
||||
onesto
|
||||
opera
|
||||
opinione
|
||||
ordine
|
||||
oriente
|
||||
origine
|
||||
oro
|
||||
orso
|
||||
ortica
|
||||
ospite
|
||||
ovale
|
||||
ovest
|
||||
padre
|
||||
palazzo
|
||||
palude
|
||||
pane
|
||||
parete
|
||||
parlare
|
||||
partita
|
||||
passero
|
||||
patto
|
||||
paura
|
||||
pedana
|
||||
pellicola
|
||||
pensare
|
||||
perla
|
||||
persona
|
||||
pesare
|
||||
pianeta
|
||||
pianta
|
||||
pietra
|
||||
pigro
|
||||
pilota
|
||||
piuma
|
||||
piuttosto
|
||||
plastica
|
||||
poesia
|
||||
polline
|
||||
ponte
|
||||
popolo
|
||||
porta
|
||||
pozzo
|
||||
pranzo
|
||||
pregio
|
||||
premio
|
||||
presa
|
||||
primato
|
||||
principe
|
||||
prisma
|
||||
produrre
|
||||
profilo
|
||||
profumo
|
||||
progetto
|
||||
promessa
|
||||
pronto
|
||||
prova
|
||||
prudente
|
||||
quaderno
|
||||
quercia
|
||||
questione
|
||||
quota
|
||||
radice
|
||||
ragione
|
||||
ramo
|
||||
rapido
|
||||
rasoio
|
||||
reale
|
||||
regola
|
||||
respiro
|
||||
restare
|
||||
rete
|
||||
ricamo
|
||||
ricerca
|
||||
riccio
|
||||
ricordo
|
||||
ridere
|
||||
riflesso
|
||||
riga
|
||||
rigore
|
||||
rimanere
|
||||
rimedio
|
||||
riparo
|
||||
ripetere
|
||||
riposo
|
||||
ritmo
|
||||
ritorno
|
||||
riva
|
||||
roccia
|
||||
rompere
|
||||
rosa
|
||||
rotazione
|
||||
rotta
|
||||
rubino
|
||||
ruga
|
||||
rumore
|
||||
ruota
|
||||
salire
|
||||
salone
|
||||
saltare
|
||||
salute
|
||||
sapere
|
||||
sasso
|
||||
sedia
|
||||
segnale
|
||||
segreto
|
||||
selva
|
||||
seme
|
||||
sentire
|
||||
sereno
|
||||
serpente
|
||||
servire
|
||||
sestante
|
||||
settore
|
||||
sfera
|
||||
sfida
|
||||
sguardo
|
||||
silenzio
|
||||
simbolo
|
||||
sincero
|
||||
slancio
|
||||
smeraldo
|
||||
soglia
|
||||
solare
|
||||
solido
|
||||
soltanto
|
||||
sonno
|
||||
sopra
|
||||
sorgere
|
||||
sorriso
|
||||
sospeso
|
||||
sosta
|
||||
spada
|
||||
spazio
|
||||
specchio
|
||||
spessore
|
||||
spiga
|
||||
spirito
|
||||
sponda
|
||||
sportivo
|
||||
sprone
|
||||
stabile
|
||||
stagione
|
||||
stella
|
||||
stelo
|
||||
stendere
|
||||
stile
|
||||
stima
|
||||
storia
|
||||
strada
|
||||
studiare
|
||||
subito
|
||||
suono
|
||||
superare
|
||||
tacere
|
||||
talento
|
||||
tappeto
|
||||
tavolo
|
||||
teatro
|
||||
tecnica
|
||||
telaio
|
||||
tempo
|
||||
tendere
|
||||
tenere
|
||||
tensione
|
||||
terra
|
||||
tetto
|
||||
tigre
|
||||
timore
|
||||
titolo
|
||||
tornare
|
||||
torre
|
||||
traccia
|
||||
tradurre
|
||||
trama
|
||||
trasporto
|
||||
trattare
|
||||
treno
|
||||
triangolo
|
||||
trionfo
|
||||
trovare
|
||||
tulipano
|
||||
turbine
|
||||
udire
|
||||
ulivo
|
||||
umile
|
||||
unione
|
||||
urbano
|
||||
usanza
|
||||
uscire
|
||||
utile
|
||||
valore
|
||||
variare
|
||||
vasca
|
||||
vecchio
|
||||
vedetta
|
||||
vela
|
||||
veloce
|
||||
vendere
|
||||
vento
|
||||
verita
|
||||
vernice
|
||||
versare
|
||||
viaggio
|
||||
vicenda
|
||||
vicino
|
||||
vigore
|
||||
villaggio
|
||||
viola
|
||||
virgola
|
||||
virtu
|
||||
visione
|
||||
vistoso
|
||||
vita
|
||||
vivere
|
||||
vocazione
|
||||
voce
|
||||
volere
|
||||
volpe
|
||||
zaino
|
||||
zefiro
|
||||
zolla
|
||||
zucchero
|
||||
17311
vocaboli_it_esteso.txt
Normal file
17311
vocaboli_it_esteso.txt
Normal file
File diff suppressed because it is too large
Load Diff
16357
vocaboli_it_filtrato.txt
Normal file
16357
vocaboli_it_filtrato.txt
Normal file
File diff suppressed because it is too large
Load Diff
125845
vocaboli_it_metadata.json
Normal file
125845
vocaboli_it_metadata.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user