from __future__ import annotations import argparse import json import re import time import urllib.parse import urllib.request import urllib.error from copy import deepcopy from datetime import datetime from pathlib import Path from typing import Dict, Iterable, List, Optional, Sequence, Tuple from refine_lexicon_topics import REFINED_LEXICON_OUTPUT_PATH REVIEW_INPUT_PATH = Path(__file__).with_name("to_be_review.json") WIKTIONARY_CACHE_PATH = Path(__file__).with_name(".wiktionary_cache.json") WIKTIONARY_OUTPUT_PATH = Path(__file__).with_name("lexicon_it_refined_plus_wiktionary.json") WIKTIONARY_API_URL = "https://it.wiktionary.org/w/api.php" DEFAULT_REVIEW_REASONS = {"no_viable_definition", "only_general_topics", "babelnet_ambiguous"} POS_ALIASES = { "sostantivo": "NOUN", "nome": "NOUN", "sost": "NOUN", "aggettivo": "ADJ", "agg": "ADJ", "verbo": "VERB", "verb": "VERB", "verb form": "VERB_FORM", "avverbio": "ADV", "avv": "ADV", "preposizione": "PREP", "prep": "PREP", "congiunzione": "CONJ", "cong": "CONJ", "pronome": "PRON", "pron": "PRON", "articolo": "ART", "interiezione": "INTJ", "inter": "INTJ", "locuzione": "PHRASE", "loc": "PHRASE", } TOPIC_KEYWORDS = { "religion": ("religione", "cattolic", "sacro", "sacra", "devozion", "scapolare", "abbazia", "monastero"), "clothing": ("abito", "vestito", "vestit", "abbigliamento", "indumento", "stoffa"), "grammar": ("diminutivo", "voce verbale", "congiuntivo", "plurale", "singolare", "grammatica", "verbo"), "geography": ("comune", "paese", "regione", "provincia", "citta", "localita", "frazione"), "transport": ("veicolo", "motore", "treno", "aereo", "trasporto", "nave", "imbarcazione"), "health": ("medicina", "ospedale", "malattia", "cura", "feriti", "ammalati", "sanitario"), } GRAMMAR_KEYWORDS = ( "diminutivo", "accrescitivo", "peggiorativo", "alterato", "voce verbale", "congiuntivo", "participio", "plurale", "singolare", "maschile", "femminile", ) def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description=( "Arricchisce le voci problematiche del lessico refined con definizioni e metadati " "estratti da it.wiktionary.org." ) ) parser.add_argument( "--input", type=Path, default=REFINED_LEXICON_OUTPUT_PATH, help="Lessico refined di partenza.", ) parser.add_argument( "--review", type=Path, default=REVIEW_INPUT_PATH, help="File to_be_review.json da usare per selezionare le voci prioritarie.", ) parser.add_argument( "--output", type=Path, default=WIKTIONARY_OUTPUT_PATH, help="Nuovo lessico con blocco wiktionary aggiunto.", ) parser.add_argument( "--cache", type=Path, default=WIKTIONARY_CACHE_PATH, help="Cache locale delle risposte Wiktionary.", ) parser.add_argument( "--word-limit", type=int, default=0, help="Limite massimo di parole da elaborare. 0 = tutte le candidate.", ) parser.add_argument( "--sleep", type=float, default=1.0, help="Pausa tra le richieste HTTP a Wiktionary.", ) parser.add_argument( "--save-every", type=int, default=25, help="Salva cache e output ogni N parole elaborate per non perdere progresso.", ) parser.add_argument( "--retry-429", type=int, default=3, help="Numero massimo di tentativi aggiuntivi se Wiktionary risponde HTTP 429.", ) parser.add_argument( "--backoff-429", type=float, default=30.0, help="Secondi di attesa iniziali dopo un HTTP 429; raddoppiano a ogni nuovo tentativo.", ) parser.add_argument( "--stop-on-429", action="store_true", help="Se attivo, al primo HTTP 429 salva lo stato e interrompe il batch senza altri tentativi.", ) parser.add_argument( "--words", default="", help="Lista separata da virgole di lemmi specifici da arricchire.", ) parser.add_argument( "--review-reasons", default=",".join(sorted(DEFAULT_REVIEW_REASONS)), help="Motivi del file review da trattare con priorita, separati da virgole.", ) parser.add_argument( "--api-url", default=WIKTIONARY_API_URL, help="Endpoint MediaWiki Action API di Wiktionary.", ) parser.add_argument( "--skip-existing", action="store_true", help="Salta le voci che nel lessico di input hanno già un blocco wiktionary con stato utile.", ) return parser.parse_args() def load_json(path: Path, default: object) -> object: if not path.exists(): return default return json.loads(path.read_text(encoding="utf-8")) def write_json(path: Path, payload: object) -> None: path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") def parse_csv_set(value: str) -> set[str]: return {item.strip().lower() for item in str(value or "").split(",") if item.strip()} def entry_key(entry: Dict[str, object]) -> Tuple[str, str]: form = str(entry.get("normalized_form") or entry.get("form") or "").strip().lower() pos = str(entry.get("pos") or "").strip().upper() return form, pos def fetch_wikitext(title: str, api_url: str) -> Dict[str, object]: params = { "action": "query", "prop": "revisions", "titles": title, "rvprop": "content", "rvslots": "main", "formatversion": "2", "format": "json", } url = f"{api_url}?{urllib.parse.urlencode(params)}" request = urllib.request.Request( url, headers={ "User-Agent": "cruciverba-alpha/0.1 (local lexical enrichment)", "Accept": "application/json", }, ) with urllib.request.urlopen(request, timeout=30) as response: payload = json.loads(response.read().decode("utf-8")) pages = ((payload.get("query") or {}).get("pages") or []) if not pages: return {"status": "missing"} page = pages[0] if page.get("missing"): return {"status": "missing", "title": page.get("title", title)} revisions = page.get("revisions") or [] content = "" if revisions: slots = revisions[0].get("slots") or {} main_slot = slots.get("main") or {} content = str(main_slot.get("content") or "") return { "status": "ok" if content else "empty", "title": page.get("title", title), "pageid": page.get("pageid"), "wikitext": content, } def fetch_wikitext_with_retry(title: str, args: argparse.Namespace) -> Dict[str, object]: attempts = 0 delay = max(1.0, float(args.backoff_429)) while True: try: return fetch_wikitext(title, args.api_url) except urllib.error.HTTPError as exc: if exc.code != 429: raise if args.stop_on_429: raise if attempts >= max(0, int(args.retry_429)): raise attempts += 1 print(f"[429] {title}: attendo {delay:.1f}s prima del tentativo {attempts}/{args.retry_429}") time.sleep(delay) delay *= 2 def normalize_heading(text: str) -> str: raw = str(text or "").strip().lower().replace(" ", "") if raw == "{{-it-}}": return "{{-it-}}" cleaned = strip_wikicode(text).strip().lower() return cleaned def extract_italian_section(wikitext: str) -> str: section_pattern = re.compile(r"^==\s*(.*?)\s*==\s*$", re.MULTILINE) matches = list(section_pattern.finditer(wikitext)) for index, match in enumerate(matches): raw_heading = str(match.group(1) or "").strip().lower().replace(" ", "") heading = normalize_heading(match.group(1)) if raw_heading == "{{-it-}}" or heading in {"italiano", "it"}: start = match.end() end = matches[index + 1].start() if index + 1 < len(matches) else len(wikitext) return wikitext[start:end] return "" def strip_templates(text: str) -> str: previous = None current = text while previous != current: previous = current current = re.sub(r"\{\{([^{}|]+)\|([^{}]+?)\}\}", r"\2", current) current = re.sub(r"\{\{[^{}]+\}\}", "", current) return current def strip_wikicode(text: str) -> str: value = str(text or "") value = re.sub(r"", " ", value, flags=re.DOTALL) value = re.sub(r"]*>.*?", " ", value, flags=re.DOTALL) value = re.sub(r"<[^>]+>", " ", value) value = strip_templates(value) value = re.sub(r"\[\[([^|\]]+)\|([^\]]+)\]\]", r"\2", value) value = re.sub(r"\[\[([^\]]+)\]\]", r"\1", value) value = value.replace("'''", "").replace("''", "") value = value.replace(" ", " ") value = re.sub(r"\s+", " ", value) return value.strip(" .;:-") def infer_topics(definitions: Sequence[str], categories: Sequence[str]) -> List[str]: text = " ".join(definitions + list(categories)).lower() topics = [] for topic, keywords in TOPIC_KEYWORDS.items(): if any(keyword in text for keyword in keywords): topics.append(topic) return sorted(set(topics)) def infer_grammar_hints(definitions: Sequence[str], raw_section: str) -> List[str]: text = f"{' '.join(definitions)} {raw_section}".lower() hints = [] for keyword in GRAMMAR_KEYWORDS: if keyword in text: hints.append(keyword) return sorted(set(hints)) def detect_pos_from_heading(heading: str) -> Optional[str]: normalized = normalize_heading(heading) if not normalized: return None for label, pos in sorted(POS_ALIASES.items(), key=lambda item: len(item[0]), reverse=True): if label in normalized: return pos return None def parse_template_marker(line: str) -> Tuple[Optional[str], Optional[str]]: stripped = line.strip() match = re.match(r"^\{\{-([^{}|]+?)-?(?:\|.*)?\}\}$", stripped, flags=re.IGNORECASE) if not match: return None, None marker = match.group(1).strip().lower() if marker == "it": return "language", "it" for label, pos in sorted(POS_ALIASES.items(), key=lambda item: len(item[0]), reverse=True): if marker.startswith(label): return "pos", pos if marker.startswith("sinon"): return "subsection", "sinonimi" if marker.startswith(("etim", "trad", "sill", "pron", "var", "note")): return "subsection", marker return "subsection", marker def parse_wiktionary_section(section_text: str) -> Dict[str, object]: lines = section_text.splitlines() entries: List[Dict[str, object]] = [] categories: List[str] = [] current: Optional[Dict[str, object]] = None current_subsection = "" heading_pattern = re.compile(r"^(={3,4})\s*(.*?)\s*\1\s*$") for raw_line in lines: line = raw_line.rstrip() if not line: continue for category_match in re.findall(r"\[\[Categoria:([^\]]+)\]\]", line): categories.append(strip_wikicode(category_match)) marker_kind, marker_value = parse_template_marker(line) if marker_kind == "pos": current = { "pos": marker_value, "heading": marker_value, "definitions": [], "examples": [], "synonyms": [], } entries.append(current) current_subsection = "" continue if marker_kind == "subsection": current_subsection = str(marker_value or "") continue heading_match = heading_pattern.match(line) if heading_match: level = len(heading_match.group(1)) heading = heading_match.group(2) if level == 3: pos = detect_pos_from_heading(heading) if pos: current = { "pos": pos, "heading": strip_wikicode(heading), "definitions": [], "examples": [], "synonyms": [], } entries.append(current) current_subsection = "" continue current_subsection = normalize_heading(heading) continue if current is None: continue stripped = line.lstrip() if stripped.startswith("#") and not stripped.startswith(("#:", "#*", "#;")): definition = strip_wikicode(stripped.lstrip("#").strip()) if definition: current["definitions"].append(definition) continue if stripped.startswith("#:") or stripped.startswith("#*"): example = strip_wikicode(stripped[2:].strip()) if example: current["examples"].append(example) continue if current_subsection.startswith("sinonim") and stripped.startswith("*"): synonym = strip_wikicode(stripped.lstrip("*").strip()) if synonym: current["synonyms"].append(synonym) flat_definitions = [definition for entry in entries for definition in entry["definitions"]] topic_hints = infer_topics(flat_definitions, categories) grammar_hints = infer_grammar_hints(flat_definitions, section_text) return { "entries": entries, "categories": sorted(set(filter(None, categories))), "definitions": flat_definitions, "topic_hints": topic_hints, "grammar_hints": grammar_hints, } def wiktionary_payload_for_entry(entry: Dict[str, object], api_response: Dict[str, object]) -> Dict[str, object]: status = str(api_response.get("status", "missing")) if status != "ok": return { "status": status, "matched": False, "page_title": api_response.get("title") or entry.get("form"), "source_url": f"https://it.wiktionary.org/wiki/{urllib.parse.quote(str(entry.get('form', '')))}", "definitions": [], "entries": [], "topic_hints": [], "grammar_hints": [], "categories": [], } italian_section = extract_italian_section(str(api_response.get("wikitext") or "")) if not italian_section: return { "status": "no_italian_section", "matched": False, "page_title": api_response.get("title") or entry.get("form"), "source_url": f"https://it.wiktionary.org/wiki/{urllib.parse.quote(str(api_response.get('title') or entry.get('form', '')))}", "definitions": [], "entries": [], "topic_hints": [], "grammar_hints": [], "categories": [], } parsed = parse_wiktionary_section(italian_section) matched = bool(parsed["definitions"]) return { "status": "enriched" if matched else "section_without_definitions", "matched": matched, "page_title": api_response.get("title") or entry.get("form"), "pageid": api_response.get("pageid"), "source_url": f"https://it.wiktionary.org/wiki/{urllib.parse.quote(str(api_response.get('title') or entry.get('form', '')))}", "definitions": parsed["definitions"], "entries": parsed["entries"], "topic_hints": parsed["topic_hints"], "grammar_hints": parsed["grammar_hints"], "categories": parsed["categories"], "raw_excerpt": italian_section[:4000], } def select_targets( refined_payload: Dict[str, object], review_payload: Dict[str, object], review_reasons: set[str], explicit_words: set[str], word_limit: int, skip_existing: bool, ) -> Tuple[List[Dict[str, object]], int]: refined_entries = [entry for entry in refined_payload.get("entries", []) or [] if isinstance(entry, dict)] refined_by_word = {str(entry.get("form", "")).lower(): entry for entry in refined_entries if entry.get("form")} if explicit_words: selected = [] skipped_existing_count = 0 for word in explicit_words: entry = refined_by_word.get(word) if entry is None: continue if skip_existing and wiktionary_already_useful(entry): skipped_existing_count += 1 continue selected.append(entry) selected = selected[:word_limit] if word_limit > 0 else selected return selected, skipped_existing_count review_entries = [entry for entry in review_payload.get("entries", []) or [] if isinstance(entry, dict)] selected_words: List[str] = [] seen = set() skipped_existing_count = 0 for review_entry in review_entries: word = str(review_entry.get("form", "")).strip().lower() if not word or word in seen: continue reasons = {str(item).lower() for item in review_entry.get("review_reasons", []) or []} refined = refined_by_word.get(word) if refined is None: continue if skip_existing and wiktionary_already_useful(refined): skipped_existing_count += 1 continue babelnet_status = str((refined.get("babelnet") or {}).get("status", "")).lower() if reasons.intersection(review_reasons) or babelnet_status == "no_match": selected_words.append(word) seen.add(word) if word_limit > 0 and len(selected_words) >= word_limit: break return [refined_by_word[word] for word in selected_words if word in refined_by_word], skipped_existing_count def wiktionary_already_useful(entry: Dict[str, object]) -> bool: wiktionary = entry.get("wiktionary", {}) if not isinstance(wiktionary, dict): return False status = str(wiktionary.get("status", "")).lower() if status == "enriched" and (wiktionary.get("definitions") or wiktionary.get("entries")): return True if status in {"missing", "no_italian_section", "section_without_definitions", "empty"}: return True return False def enrich_from_wiktionary(args: argparse.Namespace) -> Dict[str, object]: refined_payload = load_json(args.input, {"entries": []}) if not isinstance(refined_payload, dict) or "entries" not in refined_payload: raise ValueError(f"Lessico refined non valido: {args.input}") review_payload = load_json(args.review, {"entries": []}) if not isinstance(review_payload, dict): review_payload = {"entries": []} cache = load_json(args.cache, {}) if not isinstance(cache, dict): cache = {} targets, skipped_existing_count = select_targets( refined_payload, review_payload, parse_csv_set(args.review_reasons), parse_csv_set(args.words), args.word_limit, args.skip_existing, ) enriched_entries = [] cache_hits = 0 network_calls = 0 network_attempts = 0 processed_count = 0 stopped_reason = None stop_word = None print( f"Target selezionati: {len(targets)}" + (f" | già saltati per wiktionary esistente: {skipped_existing_count}" if args.skip_existing else "") ) def persist_progress() -> None: refined_index = { entry_key(entry): entry for entry in refined_payload.get("entries", []) or [] if isinstance(entry, dict) } for item in enriched_entries: refined_index[entry_key(item)] = item merged_entries = list(refined_index.values()) merged_entries.sort(key=lambda item: (str(item.get("normalized_form", "")), str(item.get("pos", "")))) merged_payload = { "meta": { **(refined_payload.get("meta", {}) if isinstance(refined_payload.get("meta"), dict) else {}), "wiktionary_source": args.api_url, "wiktionary_generated_at": datetime.now().astimezone().isoformat(timespec="seconds"), "wiktionary_target_count": len(targets), "wiktionary_processed_count": processed_count, "wiktionary_skipped_existing_count": skipped_existing_count, "wiktionary_cache_hits": cache_hits, "wiktionary_network_calls": network_calls, "wiktionary_network_attempts": network_attempts, "wiktionary_stopped_reason": stopped_reason, "wiktionary_stop_word": stop_word, }, "entries": merged_entries, } write_json(args.cache, cache) write_json(args.output, merged_payload) for index, entry in enumerate(targets, start=1): updated = deepcopy(entry) word = str(entry.get("form", "")).strip() cache_key = word.lower() if cache_key in cache: api_response = cache[cache_key] cache_hits += 1 else: try: network_attempts += 1 api_response = fetch_wikitext_with_retry(word, args) except urllib.error.HTTPError as exc: if exc.code == 429: stop_word = word stopped_reason = f"http_429_after_{processed_count}_words" print(f"[STOP] Wiktionary ha risposto 429 su '{word}'. Salvo il progresso e interrompo il batch.") persist_progress() return { "target_count": len(targets), "processed_count": processed_count, "skipped_existing_count": skipped_existing_count, "cache_hits": cache_hits, "network_calls": network_calls, "network_attempts": network_attempts, "output": str(args.output), "stopped_reason": stopped_reason, "stop_word": stop_word, } raise cache[cache_key] = api_response network_calls += 1 if args.sleep > 0: time.sleep(args.sleep) updated["wiktionary"] = wiktionary_payload_for_entry(updated, api_response) updated["wiktionary_generated_at"] = datetime.now().astimezone().isoformat(timespec="seconds") enriched_entries.append(updated) processed_count += 1 print( f"[{index}/{len(targets)}] {word}: " f"status={updated['wiktionary'].get('status')} " f"def={len(updated['wiktionary'].get('definitions', []))} " f"topics={len(updated['wiktionary'].get('topic_hints', []))}" ) if args.save_every > 0 and processed_count % int(args.save_every) == 0: persist_progress() print(f"[save] progresso salvato dopo {processed_count} parole") persist_progress() return { "target_count": len(targets), "processed_count": processed_count, "skipped_existing_count": skipped_existing_count, "cache_hits": cache_hits, "network_calls": network_calls, "network_attempts": network_attempts, "output": str(args.output), "stopped_reason": stopped_reason, "stop_word": stop_word, } def main() -> None: args = parse_args() result = enrich_from_wiktionary(args) print(f"Lessico con Wiktionary generato: {result['output']}") print(f"Voci trattate: {result.get('processed_count', result['target_count'])}/{result['target_count']}") if "skipped_existing_count" in result: print(f"Voci già saltate: {result['skipped_existing_count']}") print(f"Cache hit: {result['cache_hits']}") print(f"Chiamate rete: {result['network_calls']}") if "network_attempts" in result: print(f"Tentativi di rete: {result['network_attempts']}") if result.get("stopped_reason"): print(f"Batch interrotto: {result['stopped_reason']}") if result.get("stop_word"): print(f"Ultima parola bloccante: {result['stop_word']}") if __name__ == "__main__": main()