alpha01 backoffice: crossword engine, lexicon curation and JSON contract
This commit is contained in:
678
enrich_review_from_wiktionary.py
Normal file
678
enrich_review_from_wiktionary.py
Normal file
@@ -0,0 +1,678 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from copy import deepcopy
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
|
||||
from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH
|
||||
|
||||
|
||||
# Default review file (sibling of this script) used to pick priority entries.
REVIEW_INPUT_PATH = Path(__file__).with_name("to_be_review.json")
# Default on-disk cache of Wiktionary API responses (sibling of this script).
WIKTIONARY_CACHE_PATH = Path(__file__).with_name(".wiktionary_cache.json")
# Default output file: the refined lexicon with a "wiktionary" block added.
WIKTIONARY_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_refined_plus_wiktionary.json")
# Default MediaWiki Action API endpoint (Italian Wiktionary).
WIKTIONARY_API_URL = "https://it.wiktionary.org/w/api.php"

# Review reasons selected by default; this set seeds the --review-reasons option.
DEFAULT_REVIEW_REASONS = {"no_viable_definition", "only_general_topics", "babelnet_ambiguous"}
|
||||
|
||||
# Lowercase part-of-speech labels and abbreviations mapped to the POS tag
# stored on parsed entries. Lookups try the longest label first, so a full
# word such as "avverbio" wins over a shorter alias such as "verb".
POS_ALIASES = {
    "sostantivo": "NOUN",
    "nome": "NOUN",
    "sost": "NOUN",
    "aggettivo": "ADJ",
    "agg": "ADJ",
    "verbo": "VERB",
    "verb": "VERB",
    "verb form": "VERB_FORM",
    "avverbio": "ADV",
    "avv": "ADV",
    "preposizione": "PREP",
    "prep": "PREP",
    "congiunzione": "CONJ",
    "cong": "CONJ",
    "pronome": "PRON",
    "pron": "PRON",
    "articolo": "ART",
    "interiezione": "INTJ",
    "inter": "INTJ",
    "locuzione": "PHRASE",
    "loc": "PHRASE",
}
|
||||
|
||||
# Topic name -> lowercase substrings that signal the topic when found in the
# joined definitions and categories of a page. Several keywords are stems
# ("cattolic", "devozion", "vestit") so that inflected forms match as well.
# NOTE(review): "citta" and "localita" are spelled without accents; confirm
# the matching code folds accents, otherwise they cannot match accented text.
TOPIC_KEYWORDS = {
    "religion": ("religione", "cattolic", "sacro", "sacra", "devozion", "scapolare", "abbazia", "monastero"),
    "clothing": ("abito", "vestito", "vestit", "abbigliamento", "indumento", "stoffa"),
    "grammar": ("diminutivo", "voce verbale", "congiuntivo", "plurale", "singolare", "grammatica", "verbo"),
    "geography": ("comune", "paese", "regione", "provincia", "citta", "localita", "frazione"),
    "transport": ("veicolo", "motore", "treno", "aereo", "trasporto", "nave", "imbarcazione"),
    "health": ("medicina", "ospedale", "malattia", "cura", "feriti", "ammalati", "sanitario"),
}
|
||||
|
||||
# Lowercase grammatical terms reported as "grammar hints" when they occur as
# substrings of a page's definitions or of its raw Italian section.
GRAMMAR_KEYWORDS = (
    "diminutivo",
    "accrescitivo",
    "peggiorativo",
    "alterato",
    "voce verbale",
    "congiuntivo",
    "participio",
    "plurale",
    "singolare",
    "maschile",
    "femminile",
)
|
||||
|
||||
|
||||
def parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the enrichment script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behaviour of the original
            zero-argument call; passing a list allows programmatic use.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Arricchisce le voci problematiche del lessico refined con definizioni e metadati "
            "estratti da it.wiktionary.org."
        )
    )

    # --- input / output locations -------------------------------------
    parser.add_argument(
        "--input",
        type=Path,
        default=REFINED_LEXICON_OUTPUT_PATH,
        help="Lessico refined di partenza.",
    )
    parser.add_argument(
        "--review",
        type=Path,
        default=REVIEW_INPUT_PATH,
        help="File to_be_review.json da usare per selezionare le voci prioritarie.",
    )
    parser.add_argument(
        "--output",
        type=Path,
        default=WIKTIONARY_OUTPUT_PATH,
        help="Nuovo lessico con blocco wiktionary aggiunto.",
    )
    parser.add_argument(
        "--cache",
        type=Path,
        default=WIKTIONARY_CACHE_PATH,
        help="Cache locale delle risposte Wiktionary.",
    )

    # --- batch pacing and persistence ---------------------------------
    parser.add_argument(
        "--word-limit",
        type=int,
        default=0,
        help="Limite massimo di parole da elaborare. 0 = tutte le candidate.",
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=1.0,
        help="Pausa tra le richieste HTTP a Wiktionary.",
    )
    parser.add_argument(
        "--save-every",
        type=int,
        default=25,
        help="Salva cache e output ogni N parole elaborate per non perdere progresso.",
    )

    # --- HTTP 429 (rate limit) handling -------------------------------
    parser.add_argument(
        "--retry-429",
        type=int,
        default=3,
        help="Numero massimo di tentativi aggiuntivi se Wiktionary risponde HTTP 429.",
    )
    parser.add_argument(
        "--backoff-429",
        type=float,
        default=30.0,
        help="Secondi di attesa iniziali dopo un HTTP 429; raddoppiano a ogni nuovo tentativo.",
    )
    parser.add_argument(
        "--stop-on-429",
        action="store_true",
        help="Se attivo, al primo HTTP 429 salva lo stato e interrompe il batch senza altri tentativi.",
    )

    # --- target selection ---------------------------------------------
    parser.add_argument(
        "--words",
        default="",
        help="Lista separata da virgole di lemmi specifici da arricchire.",
    )
    parser.add_argument(
        "--review-reasons",
        # Sorted so the default string shown in --help is stable across runs.
        default=",".join(sorted(DEFAULT_REVIEW_REASONS)),
        help="Motivi del file review da trattare con priorita, separati da virgole.",
    )
    parser.add_argument(
        "--api-url",
        default=WIKTIONARY_API_URL,
        help="Endpoint MediaWiki Action API di Wiktionary.",
    )
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="Salta le voci che nel lessico di input hanno già un blocco wiktionary con stato utile.",
    )
    return parser.parse_args(argv)
|
||||
|
||||
|
||||
def load_json(path: Path, default: object) -> object:
    """Return the JSON document stored at *path*, or *default* if the path does not exist.

    The file is read as UTF-8. Decoding errors are not caught here.
    """
    if path.exists():
        raw_text = path.read_text(encoding="utf-8")
        return json.loads(raw_text)
    return default
|
||||
|
||||
|
||||
def write_json(path: Path, payload: object) -> None:
    """Serialize *payload* as pretty-printed UTF-8 JSON and store it at *path*.

    The document is first written to a ``<name>.tmp`` sibling and then renamed
    over the target, so an interruption in the middle of a save cannot leave a
    truncated cache or lexicon file behind.

    Args:
        path: Destination file.
        payload: Any JSON-serializable object.
    """
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    scratch = path.with_name(f"{path.name}.tmp")
    scratch.write_text(serialized, encoding="utf-8")
    # Path.replace overwrites an existing destination.
    scratch.replace(path)
|
||||
|
||||
|
||||
def parse_csv_set(value: str) -> set[str]:
    """Split a comma-separated string into a set of trimmed, lowercase tokens.

    Empty tokens are dropped; a falsy *value* yields an empty set.
    """
    trimmed = (chunk.strip() for chunk in str(value or "").split(","))
    return {token.lower() for token in trimmed if token}
|
||||
|
||||
|
||||
def entry_key(entry: Dict[str, object]) -> Tuple[str, str]:
    """Build the ``(form, pos)`` identity of a lexicon entry.

    The form comes from ``normalized_form`` (falling back to ``form``) and is
    trimmed and lowercased; the part of speech is trimmed and uppercased.
    Missing or falsy fields become empty strings.
    """
    raw_form = entry.get("normalized_form") or entry.get("form") or ""
    raw_pos = entry.get("pos") or ""
    return str(raw_form).strip().lower(), str(raw_pos).strip().upper()
|
||||
|
||||
|
||||
def fetch_wikitext(title: str, api_url: str) -> Dict[str, object]:
    """Download the current wikitext of page *title* through the MediaWiki Action API.

    Args:
        title: Page title to look up.
        api_url: Base URL of the ``api.php`` endpoint.

    Returns:
        A dict whose ``status`` is ``"missing"`` (no page in the response, or
        the page is flagged missing), ``"empty"`` (page found but no main-slot
        content) or ``"ok"``. Found pages also carry ``title``, ``pageid`` and
        ``wikitext``.

    Raises:
        urllib.error.HTTPError / urllib.error.URLError: propagated from
            ``urlopen``; nothing is caught here.
    """
    query_string = urllib.parse.urlencode(
        {
            "action": "query",
            "prop": "revisions",
            "titles": title,
            "rvprop": "content",
            "rvslots": "main",
            "formatversion": "2",
            "format": "json",
        }
    )
    request = urllib.request.Request(
        f"{api_url}?{query_string}",
        headers={
            "User-Agent": "cruciverba-alpha/0.1 (local lexical enrichment)",
            "Accept": "application/json",
        },
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        body = response.read().decode("utf-8")
    payload = json.loads(body)

    pages = (payload.get("query") or {}).get("pages") or []
    if not pages:
        return {"status": "missing"}

    # A single title was requested, so only the first page is inspected.
    page = pages[0]
    resolved_title = page.get("title", title)
    if page.get("missing"):
        return {"status": "missing", "title": resolved_title}

    wikitext = ""
    revisions = page.get("revisions") or []
    if revisions:
        main_slot = (revisions[0].get("slots") or {}).get("main") or {}
        wikitext = str(main_slot.get("content") or "")

    return {
        "status": "ok" if wikitext else "empty",
        "title": resolved_title,
        "pageid": page.get("pageid"),
        "wikitext": wikitext,
    }
|
||||
|
||||
|
||||
def fetch_wikitext_with_retry(title: str, args: argparse.Namespace) -> Dict[str, object]:
    """Call ``fetch_wikitext`` and retry with exponential backoff on HTTP 429.

    Args:
        title: Page title to fetch.
        args: Parsed options; reads ``api_url``, ``backoff_429`` (initial wait,
            floored at one second), ``retry_429`` (maximum extra attempts) and
            ``stop_on_429`` (disable retries altogether).

    Returns:
        Whatever ``fetch_wikitext`` returns on the first successful attempt.

    Raises:
        urllib.error.HTTPError: immediately for any status other than 429, and
            for 429 when retries are disabled or exhausted.
    """
    wait_seconds = max(1.0, float(args.backoff_429))
    retries_done = 0
    while True:
        try:
            return fetch_wikitext(title, args.api_url)
        except urllib.error.HTTPError as exc:
            # Short-circuit order matters: the retry budget is only read for
            # a 429 that is allowed to be retried.
            give_up = (
                exc.code != 429
                or args.stop_on_429
                or retries_done >= max(0, int(args.retry_429))
            )
            if give_up:
                raise
        retries_done += 1
        print(f"[429] {title}: attendo {wait_seconds:.1f}s prima del tentativo {retries_done}/{args.retry_429}")
        time.sleep(wait_seconds)
        wait_seconds *= 2
|
||||
|
||||
|
||||
def normalize_heading(text: str) -> str:
    """Reduce a heading to a lowercase, markup-free comparison key.

    The Italian language marker ``{{-it-}}`` (ignoring case and spaces) is
    returned verbatim, because stripping templates would erase it. Any other
    heading goes through ``strip_wikicode`` and is lowercased.
    """
    compact = str(text or "").strip().lower().replace(" ", "")
    if compact == "{{-it-}}":
        return compact
    return strip_wikicode(text).strip().lower()
|
||||
|
||||
|
||||
def extract_italian_section(wikitext: str) -> str:
    """Return the body of the Italian language section of a page.

    A language section starts at a level-2 heading (``== ... ==``) whose title
    is ``{{-it-}}``, ``Italiano`` or ``it``, and runs to the next level-2
    heading or to the end of the text.

    The heading pattern accepts exactly two ``=`` on each side. Without that
    restriction ``=== Sostantivo ===`` also counts as a boundary and the
    section is cut off at its first sub-heading.

    Args:
        wikitext: Full wikitext of the page.

    Returns:
        The section body without its heading line, or ``""`` when the page has
        no Italian section.
    """
    level2_heading = re.compile(r"^==(?!=)\s*(.*?)\s*(?<!=)==\s*$", re.MULTILINE)
    headings = list(level2_heading.finditer(wikitext))
    for position, heading_match in enumerate(headings):
        title = heading_match.group(1)
        compact = str(title or "").strip().lower().replace(" ", "")
        # Test the raw template marker first; the normalized form is only
        # needed for plain-text headings.
        if compact == "{{-it-}}" or normalize_heading(title) in {"italiano", "it"}:
            start = heading_match.end()
            if position + 1 < len(headings):
                end = headings[position + 1].start()
            else:
                end = len(wikitext)
            return wikitext[start:end]
    return ""
|
||||
|
||||
|
||||
def strip_templates(text: str) -> str:
    """Remove ``{{...}}`` templates, keeping one readable argument where possible.

    A template with arguments is replaced by its first positional argument
    (``{{Term|botanica|it}}`` -> ``botanica``). Further positional arguments
    and ``name=value`` arguments are dropped, so no raw ``|`` separators leak
    into the result. Templates without arguments are removed. The substitution
    is repeated until the text stops changing, which resolves nesting from the
    inside out.
    """

    def first_positional(match: "re.Match[str]") -> str:
        # Split on pipes, but not on the pipe inside a [[target|label]] link.
        for argument in re.split(r"\|(?![^\[\]]*\]\])", match.group(1)):
            argument = argument.strip()
            if argument and "=" not in argument:
                return argument
        return ""

    previous = None
    current = text
    while previous != current:
        previous = current
        current = re.sub(r"\{\{[^{}|]+\|([^{}]*)\}\}", first_positional, current)
        current = re.sub(r"\{\{[^{}]+\}\}", "", current)
    return current


def strip_wikicode(text: str) -> str:
    """Convert a wikitext fragment into plain text.

    Removes HTML comments, ``<ref>`` footnotes and other tags, resolves
    templates through ``strip_templates``, turns ``[[target|label]]`` and
    ``[[target]]`` links into their visible text, drops bold/italic quote
    markers, collapses whitespace and trims surrounding `` .;:-`` characters.

    Args:
        text: Wikitext fragment; ``None`` or other falsy values give ``""``.

    Returns:
        The cleaned text.
    """
    value = str(text or "")
    value = re.sub(r"<!--.*?-->", " ", value, flags=re.DOTALL)
    # Self-closing references go first: otherwise `<ref name=x/>` is read as
    # an opening tag and everything up to the next `</ref>` is swallowed.
    value = re.sub(r"<ref\b[^>]*/\s*>", " ", value, flags=re.IGNORECASE)
    value = re.sub(r"<ref\b[^>]*>.*?</ref\s*>", " ", value, flags=re.DOTALL | re.IGNORECASE)
    value = re.sub(r"<[^>]+>", " ", value)
    value = strip_templates(value)
    value = re.sub(r"\[\[([^|\]]+)\|([^\]]+)\]\]", r"\2", value)
    value = re.sub(r"\[\[([^\]]+)\]\]", r"\1", value)
    value = value.replace("'''", "").replace("''", "")
    # Non-breaking spaces, as an entity or as U+00A0, become plain spaces.
    value = value.replace("&nbsp;", " ").replace("\u00a0", " ")
    value = re.sub(r"\s+", " ", value)
    return value.strip(" .;:-")
|
||||
|
||||
|
||||
def _fold_accents(text: str) -> str:
    """Return *text* with combining diacritics removed (``città`` -> ``citta``)."""
    import unicodedata

    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(char for char in decomposed if not unicodedata.combining(char))


def infer_topics(
    definitions: Sequence[str],
    categories: Sequence[str],
    topic_keywords: Optional[Dict[str, Sequence[str]]] = None,
) -> List[str]:
    """Guess coarse topics from the definitions and categories of a page.

    A topic is reported when any of its keywords occurs as a substring of the
    joined, lowercased text. Text and keywords are both accent-folded before
    comparison, because the keyword table spells some words without accents
    ("citta", "localita") while the page text uses "città", "località".

    Args:
        definitions: Cleaned definition strings (any sequence, not only lists).
        categories: Category names found on the page.
        topic_keywords: Optional ``topic -> keywords`` table; defaults to the
            module-level ``TOPIC_KEYWORDS``.

    Returns:
        The matching topic names, sorted alphabetically.
    """
    if topic_keywords is None:
        topic_keywords = TOPIC_KEYWORDS
    # Unpacking instead of `definitions + list(categories)` so that tuples and
    # other non-list sequences are accepted.
    haystack = _fold_accents(" ".join([*definitions, *categories]).lower())
    return sorted(
        topic
        for topic, keywords in topic_keywords.items()
        if any(_fold_accents(str(keyword).lower()) in haystack for keyword in keywords)
    )
|
||||
|
||||
|
||||
def infer_grammar_hints(
    definitions: Sequence[str],
    raw_section: str,
    grammar_keywords: Optional[Sequence[str]] = None,
) -> List[str]:
    """List the grammatical terms mentioned in a page's definitions or raw section.

    Matching is a case-insensitive substring test against the definitions
    joined with the raw section text.

    Args:
        definitions: Cleaned definition strings.
        raw_section: Unprocessed wikitext of the Italian section.
        grammar_keywords: Optional keywords to look for; defaults to the
            module-level ``GRAMMAR_KEYWORDS``.

    Returns:
        The keywords that were found, de-duplicated and sorted.
    """
    if grammar_keywords is None:
        grammar_keywords = GRAMMAR_KEYWORDS
    haystack = f"{' '.join(definitions)} {raw_section}".lower()
    return sorted({keyword for keyword in grammar_keywords if keyword in haystack})
|
||||
|
||||
|
||||
def detect_pos_from_heading(heading: str) -> Optional[str]:
    """Map a section heading to a part-of-speech tag, if it names one.

    Aliases from ``POS_ALIASES`` are tried longest first and must start at a
    word boundary of the normalized heading. A plain substring test
    misclassifies common headings: "Proverbi" contains "verb", "Avverbi" hits
    "verb" before "avv", and "Pronuncia" starts with "pron".

    Args:
        heading: Heading text, with or without wiki markup.

    Returns:
        The POS tag, or ``None`` for empty, pronunciation and unrecognized
        headings.
    """
    normalized = normalize_heading(heading)
    if not normalized:
        return None
    # "Pronuncia"/"Pronunzia" is the pronunciation section, not a pronoun.
    if re.match(r"pronun[cz]", normalized):
        return None
    longest_first = sorted(POS_ALIASES.items(), key=lambda item: len(item[0]), reverse=True)
    for label, pos in longest_first:
        # (?<!\w) anchors the alias to the start of a word.
        if re.search(rf"(?<!\w){re.escape(label)}", normalized):
            return pos
    return None
|
||||
|
||||
|
||||
def parse_template_marker(line: str) -> Tuple[Optional[str], Optional[str]]:
    """Classify a line consisting of a ``{{-name-}}`` section template.

    Args:
        line: One line of wikitext.

    Returns:
        ``(None, None)`` when the line is not such a template, otherwise one of:

        * ``("language", "it")`` for the Italian language marker;
        * ``("pos", TAG)`` when the marker starts with a ``POS_ALIASES`` label;
        * ``("subsection", name)`` for everything else, with synonym markers
          normalized to ``"sinonimi"``.
    """
    match = re.match(r"^\{\{-([^{}|]+?)-?(?:\|.*)?\}\}$", line.strip(), flags=re.IGNORECASE)
    if not match:
        return None, None

    marker = match.group(1).strip().lower()
    if marker == "it":
        return "language", "it"

    # These checks must run before the POS aliases. The "pron" alias would
    # otherwise claim the pronunciation marker and open a bogus pronoun entry,
    # which then captures the synonyms listed after it.
    if marker == "pron" or marker.startswith("pronun"):
        return "subsection", marker
    # Accept the short `{{-sin-}}` form as well as spelled-out variants.
    if marker == "sin" or marker.startswith("sinon"):
        return "subsection", "sinonimi"

    longest_first = sorted(POS_ALIASES.items(), key=lambda item: len(item[0]), reverse=True)
    for label, pos in longest_first:
        if marker.startswith(label):
            return "pos", pos

    # Etymology, translations, syllabification, variants, notes, and so on.
    return "subsection", marker
|
||||
|
||||
|
||||
def parse_wiktionary_section(section_text: str) -> Dict[str, object]:
    """Parse the Italian section of a page into per-POS entries and hints.

    The section is scanned line by line:

    * ``[[Categoria:...]]`` links are collected from every line;
    * a POS marker (``{{-sost-|it}}`` style template, or a level-3 heading
      naming a part of speech) opens a new entry;
    * other template markers and headings only change the current subsection;
    * ``#`` lines are definitions; lines whose list marker continues with
      ``:`` or ``*`` (``#:``, ``#*``, and nested forms such as ``##:``) are
      examples;
    * ``*`` items inside a synonyms subsection are synonyms.

    Lines that appear before the first POS marker are ignored, apart from
    their categories.

    Args:
        section_text: Body of the Italian language section.

    Returns:
        A dict with ``entries``, ``categories`` (sorted, unique, non-empty),
        ``definitions`` (flattened across entries), ``topic_hints`` and
        ``grammar_hints``.
    """

    def open_entry(pos: object, heading: object) -> Dict[str, object]:
        # Single place that defines the shape of a parsed entry.
        return {
            "pos": pos,
            "heading": heading,
            "definitions": [],
            "examples": [],
            "synonyms": [],
        }

    entries: List[Dict[str, object]] = []
    categories: List[str] = []
    current: Optional[Dict[str, object]] = None
    current_subsection = ""

    heading_pattern = re.compile(r"^(={3,4})\s*(.*?)\s*\1\s*$")
    # Leading list marker of a numbered-list line: "#", "##", "#:", "##*", ...
    list_marker_pattern = re.compile(r"#[#*:;]*")

    for raw_line in section_text.splitlines():
        line = raw_line.rstrip()
        if not line:
            continue

        for category_match in re.findall(r"\[\[Categoria:([^\]]+)\]\]", line):
            categories.append(strip_wikicode(category_match))

        marker_kind, marker_value = parse_template_marker(line)
        if marker_kind == "pos":
            current = open_entry(marker_value, marker_value)
            entries.append(current)
            current_subsection = ""
            continue
        if marker_kind == "subsection":
            current_subsection = str(marker_value or "")
            continue

        heading_match = heading_pattern.match(line)
        if heading_match:
            level = len(heading_match.group(1))
            heading = heading_match.group(2)
            if level == 3:
                pos = detect_pos_from_heading(heading)
                if pos:
                    current = open_entry(pos, strip_wikicode(heading))
                    entries.append(current)
                    current_subsection = ""
                    continue
            # Level-4 headings and non-POS level-3 headings are subsections.
            current_subsection = normalize_heading(heading)
            continue

        if current is None:
            continue

        stripped = line.lstrip()
        marker_match = list_marker_pattern.match(stripped)
        if marker_match:
            marker = marker_match.group(0)
            content = strip_wikicode(stripped[len(marker):].strip())
            # First character after the run of '#' decides the line type:
            # none -> definition, ':' or '*' -> example, ';' -> ignored.
            qualifier = marker.lstrip("#")[:1]
            if qualifier == "":
                if content:
                    current["definitions"].append(content)
            elif qualifier in (":", "*"):
                if content:
                    current["examples"].append(content)
            continue

        if current_subsection.startswith("sinonim") and stripped.startswith("*"):
            synonym = strip_wikicode(stripped.lstrip("*").strip())
            if synonym:
                current["synonyms"].append(synonym)

    flat_definitions = [definition for entry in entries for definition in entry["definitions"]]
    topic_hints = infer_topics(flat_definitions, categories)
    grammar_hints = infer_grammar_hints(flat_definitions, section_text)

    return {
        "entries": entries,
        "categories": sorted(set(filter(None, categories))),
        "definitions": flat_definitions,
        "topic_hints": topic_hints,
        "grammar_hints": grammar_hints,
    }
|
||||
|
||||
|
||||
def wiktionary_payload_for_entry(entry: Dict[str, object], api_response: Dict[str, object]) -> Dict[str, object]:
    """Build the ``wiktionary`` block stored on a lexicon entry.

    Args:
        entry: The lexicon entry being enriched (its ``form`` is the fallback
            page title).
        api_response: Result of the fetch step (``status``, ``title``,
            ``pageid``, ``wikitext``).

    Returns:
        A dict whose ``status`` is the fetch status when that is not ``"ok"``,
        ``"no_italian_section"`` when the page has no Italian section,
        ``"section_without_definitions"`` when parsing found no definition, or
        ``"enriched"`` otherwise. Only the last two carry ``pageid`` and
        ``raw_excerpt`` (the first 4000 characters of the section).
    """

    def wiki_url(title: object) -> str:
        return f"https://it.wiktionary.org/wiki/{urllib.parse.quote(str(title))}"

    def unmatched(status_label: str, url_title: object) -> Dict[str, object]:
        # Shared shape of every "nothing usable found" result.
        return {
            "status": status_label,
            "matched": False,
            "page_title": api_response.get("title") or entry.get("form"),
            "source_url": wiki_url(url_title),
            "definitions": [],
            "entries": [],
            "topic_hints": [],
            "grammar_hints": [],
            "categories": [],
        }

    fetch_status = str(api_response.get("status", "missing"))
    if fetch_status != "ok":
        # This branch builds the URL from the entry form, not the API title.
        return unmatched(fetch_status, entry.get("form", ""))

    resolved_title = api_response.get("title") or entry.get("form", "")
    italian_section = extract_italian_section(str(api_response.get("wikitext") or ""))
    if not italian_section:
        return unmatched("no_italian_section", resolved_title)

    parsed = parse_wiktionary_section(italian_section)
    has_definitions = bool(parsed["definitions"])
    return {
        "status": "enriched" if has_definitions else "section_without_definitions",
        "matched": has_definitions,
        "page_title": api_response.get("title") or entry.get("form"),
        "pageid": api_response.get("pageid"),
        "source_url": wiki_url(resolved_title),
        "definitions": parsed["definitions"],
        "entries": parsed["entries"],
        "topic_hints": parsed["topic_hints"],
        "grammar_hints": parsed["grammar_hints"],
        "categories": parsed["categories"],
        "raw_excerpt": italian_section[:4000],
    }
|
||||
|
||||
|
||||
def select_targets(
    refined_payload: Dict[str, object],
    review_payload: Dict[str, object],
    review_reasons: set[str],
    explicit_words: set[str],
    word_limit: int,
    skip_existing: bool,
) -> Tuple[List[Dict[str, object]], int]:
    """Choose which lexicon entries to enrich.

    With *explicit_words*, exactly those words are selected, in alphabetical
    order, so that ``word_limit`` always keeps the same subset (iterating the
    set directly gives a different order from run to run). Otherwise the
    review file is walked in order, and a word is selected when one of its
    review reasons is in *review_reasons* or its refined entry has a BabelNet
    status of ``no_match``.

    Args:
        refined_payload: Refined lexicon (``{"entries": [...]}``).
        review_payload: Review file content (``{"entries": [...]}``).
        review_reasons: Lowercase review reasons that qualify an entry.
        explicit_words: Lowercase words to enrich regardless of the review file.
        word_limit: Maximum number of selected entries; ``0`` means no limit.
        skip_existing: Skip entries for which ``wiktionary_already_useful``
            is true.

    Returns:
        ``(selected_entries, skipped_existing_count)``.
    """
    refined_entries = [entry for entry in refined_payload.get("entries", []) or [] if isinstance(entry, dict)]
    # Keys are trimmed as well as lowercased, matching how both the explicit
    # words and the review forms are normalized before lookup.
    refined_by_word = {
        str(entry.get("form", "")).strip().lower(): entry
        for entry in refined_entries
        if entry.get("form")
    }

    if explicit_words:
        selected = []
        skipped_existing_count = 0
        for word in sorted(explicit_words):
            entry = refined_by_word.get(word)
            if entry is None:
                continue
            if skip_existing and wiktionary_already_useful(entry):
                skipped_existing_count += 1
                continue
            selected.append(entry)
        if word_limit > 0:
            selected = selected[:word_limit]
        return selected, skipped_existing_count

    review_entries = [entry for entry in review_payload.get("entries", []) or [] if isinstance(entry, dict)]
    selected_words: List[str] = []
    seen = set()
    skipped_existing_count = 0

    for review_entry in review_entries:
        word = str(review_entry.get("form", "")).strip().lower()
        if not word or word in seen:
            continue
        refined = refined_by_word.get(word)
        if refined is None:
            continue
        if skip_existing and wiktionary_already_useful(refined):
            skipped_existing_count += 1
            continue
        reasons = {str(item).lower() for item in review_entry.get("review_reasons", []) or []}
        babelnet_status = str((refined.get("babelnet") or {}).get("status", "")).lower()
        if reasons.intersection(review_reasons) or babelnet_status == "no_match":
            selected_words.append(word)
            seen.add(word)
        if word_limit > 0 and len(selected_words) >= word_limit:
            break

    return [refined_by_word[word] for word in selected_words], skipped_existing_count
|
||||
|
||||
|
||||
def wiktionary_already_useful(entry: Dict[str, object]) -> bool:
    """Tell whether *entry* already holds a conclusive ``wiktionary`` block.

    Conclusive means either an ``enriched`` block that has definitions or
    entries, or one of the terminal "nothing to find" statuses (``missing``,
    ``no_italian_section``, ``section_without_definitions``, ``empty``).
    Status comparison is case-insensitive.
    """
    block = entry.get("wiktionary", {})
    if not isinstance(block, dict):
        return False

    status = str(block.get("status", "")).lower()
    if status == "enriched":
        return bool(block.get("definitions") or block.get("entries"))
    return status in {"missing", "no_italian_section", "section_without_definitions", "empty"}
|
||||
|
||||
|
||||
def enrich_from_wiktionary(args: argparse.Namespace) -> Dict[str, object]:
    """Run the enrichment batch and write the merged lexicon plus the cache.

    For every selected entry the Wiktionary response is taken from the cache
    or fetched over the network, converted into a ``wiktionary`` block and
    merged back into the lexicon. Cache and output are saved:

    * every ``args.save_every`` processed words;
    * when the batch stops on an HTTP 429 that could not be retried;
    * before re-raising any other failure (network error, timeout, Ctrl-C),
      so that work done since the last periodic save is not lost;
    * at the end of a complete run.

    Args:
        args: Parsed command-line options (see ``parse_args``).

    Returns:
        A summary dict with counters, the output path, and
        ``stopped_reason`` / ``stop_word`` when the batch stopped early.

    Raises:
        ValueError: if the input lexicon is not a dict with an ``entries`` key.
        Exception: any non-429 failure is re-raised after progress is saved.
    """
    refined_payload = load_json(args.input, {"entries": []})
    if not isinstance(refined_payload, dict) or "entries" not in refined_payload:
        raise ValueError(f"Lessico refined non valido: {args.input}")

    review_payload = load_json(args.review, {"entries": []})
    if not isinstance(review_payload, dict):
        review_payload = {"entries": []}

    cache = load_json(args.cache, {})
    if not isinstance(cache, dict):
        cache = {}

    targets, skipped_existing_count = select_targets(
        refined_payload,
        review_payload,
        parse_csv_set(args.review_reasons),
        parse_csv_set(args.words),
        args.word_limit,
        args.skip_existing,
    )

    enriched_entries: List[Dict[str, object]] = []
    cache_hits = 0
    network_calls = 0
    network_attempts = 0
    processed_count = 0
    stopped_reason = None
    stop_word = None
    word = ""

    print(
        f"Target selezionati: {len(targets)}"
        + (f" | già saltati per wiktionary esistente: {skipped_existing_count}" if args.skip_existing else "")
    )

    def persist_progress() -> None:
        """Write the cache and the lexicon merged with everything enriched so far."""
        refined_index = {
            entry_key(entry): entry
            for entry in refined_payload.get("entries", []) or []
            if isinstance(entry, dict)
        }
        # Enriched copies replace the originals that share their (form, pos) key.
        for item in enriched_entries:
            refined_index[entry_key(item)] = item

        merged_entries = list(refined_index.values())
        merged_entries.sort(key=lambda item: (str(item.get("normalized_form", "")), str(item.get("pos", ""))))

        merged_payload = {
            "meta": {
                **(refined_payload.get("meta", {}) if isinstance(refined_payload.get("meta"), dict) else {}),
                "wiktionary_source": args.api_url,
                "wiktionary_generated_at": datetime.now().astimezone().isoformat(timespec="seconds"),
                "wiktionary_target_count": len(targets),
                "wiktionary_processed_count": processed_count,
                "wiktionary_skipped_existing_count": skipped_existing_count,
                "wiktionary_cache_hits": cache_hits,
                "wiktionary_network_calls": network_calls,
                "wiktionary_network_attempts": network_attempts,
                "wiktionary_stopped_reason": stopped_reason,
                "wiktionary_stop_word": stop_word,
            },
            "entries": merged_entries,
        }

        write_json(args.cache, cache)
        write_json(args.output, merged_payload)

    def build_summary() -> Dict[str, object]:
        """Snapshot of the counters returned to the caller."""
        return {
            "target_count": len(targets),
            "processed_count": processed_count,
            "skipped_existing_count": skipped_existing_count,
            "cache_hits": cache_hits,
            "network_calls": network_calls,
            "network_attempts": network_attempts,
            "output": str(args.output),
            "stopped_reason": stopped_reason,
            "stop_word": stop_word,
        }

    try:
        for index, entry in enumerate(targets, start=1):
            updated = deepcopy(entry)
            word = str(entry.get("form", "")).strip()
            cache_key = word.lower()

            if cache_key in cache:
                api_response = cache[cache_key]
                cache_hits += 1
            else:
                try:
                    network_attempts += 1
                    api_response = fetch_wikitext_with_retry(word, args)
                except urllib.error.HTTPError as exc:
                    if exc.code != 429:
                        raise
                    # Rate limited beyond the retry budget: stop cleanly and
                    # let the code after the loop save and report.
                    stop_word = word
                    stopped_reason = f"http_429_after_{processed_count}_words"
                    print(f"[STOP] Wiktionary ha risposto 429 su '{word}'. Salvo il progresso e interrompo il batch.")
                    break
                cache[cache_key] = api_response
                network_calls += 1
                if args.sleep > 0:
                    time.sleep(args.sleep)

            updated["wiktionary"] = wiktionary_payload_for_entry(updated, api_response)
            updated["wiktionary_generated_at"] = datetime.now().astimezone().isoformat(timespec="seconds")
            enriched_entries.append(updated)
            processed_count += 1
            print(
                f"[{index}/{len(targets)}] {word}: "
                f"status={updated['wiktionary'].get('status')} "
                f"def={len(updated['wiktionary'].get('definitions', []))} "
                f"topics={len(updated['wiktionary'].get('topic_hints', []))}"
            )
            if args.save_every > 0 and processed_count % int(args.save_every) == 0:
                persist_progress()
                print(f"[save] progresso salvato dopo {processed_count} parole")
    except BaseException as exc:
        # Includes KeyboardInterrupt: save what was done, record why the batch
        # stopped, then let the original error propagate unchanged.
        stop_word = word or None
        stopped_reason = f"error_{type(exc).__name__}_after_{processed_count}_words"
        persist_progress()
        raise

    persist_progress()
    return build_summary()
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: run the batch and print a short report."""
    summary = enrich_from_wiktionary(parse_args())

    print(f"Lessico con Wiktionary generato: {summary['output']}")

    total = summary["target_count"]
    done = summary.get("processed_count", total)
    print(f"Voci trattate: {done}/{total}")

    if "skipped_existing_count" in summary:
        print(f"Voci già saltate: {summary['skipped_existing_count']}")

    print(f"Cache hit: {summary['cache_hits']}")
    print(f"Chiamate rete: {summary['network_calls']}")

    if "network_attempts" in summary:
        print(f"Tentativi di rete: {summary['network_attempts']}")

    # Only printed when the batch stopped before the end.
    early_stop = summary.get("stopped_reason")
    if early_stop:
        print(f"Batch interrotto: {early_stop}")
    blocking_word = summary.get("stop_word")
    if blocking_word:
        print(f"Ultima parola bloccante: {blocking_word}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user