# cruciverba_1/build_vocabulary.py

from __future__ import annotations

import json
import re
import unicodedata
from pathlib import Path
from typing import Dict, Iterable, List, Optional


# Input: JSON word list shipped under "package/dist" next to this script.
PACKAGE_WORDS_PATH = Path(__file__).parent / "package" / "dist" / "words.json"

# Outputs, all written alongside this script.
OUTPUT_PATH = Path(__file__).parent / "vocaboli_it_esteso.txt"
FILTERED_OUTPUT_PATH = Path(__file__).parent / "vocaboli_it_filtrato.txt"
METADATA_OUTPUT_PATH = Path(__file__).parent / "vocaboli_it_metadata.json"

# Length bounds: shorter words are dropped during normalization, longer ones
# are tagged "too_long" during categorization.
MIN_WORD_LENGTH = 2
MAX_WORD_LENGTH = 14

# Words that categorize_word() tags as "function": simple prepositions with
# their articulated forms, plus the conjunctions e/ed and o/od.
COMMON_FUNCTION_WORDS = {
    # a
    "a", "ad", "al", "allo", "ai", "agli", "alla", "alle",
    # con
    "con", "col", "coi",
    # da
    "da", "dal", "dallo", "dai", "dagli", "dalla", "dalle",
    # di
    "di", "del", "dello", "dei", "degli", "della", "delle",
    # conjunctions
    "e", "ed", "o", "od",
    # in
    "in", "nel", "nello", "nei", "negli", "nella", "nelle",
    # per
    "per",
    # su
    "su", "sul", "sullo", "sui", "sugli", "sulla", "sulle",
    # tra / fra
    "tra", "fra",
}

# Suffix tables used by categorize_word(). Each is a tuple so it can be passed
# directly to str.endswith(). Words are matched after accent stripping, so the
# suffixes are written without accents (e.g. "ita").

# Infinitive endings of the three verb conjugations.
COMMON_VERB_SUFFIXES = ("are", "ere", "ire")

# Adverbs formed with "-mente".
COMMON_ADVERB_SUFFIXES = ("mente",)

# Noun endings, listed as singular/plural pairs where the plural differs.
COMMON_NOUN_SUFFIXES = (
    "zione",
    "zioni",
    "tore",
    "tori",
    "trice",
    "trici",
    "ista",
    "isti",
    "ismo",
    "ismi",
    "anza",
    "anze",  # plural of "anza"; previously missing, unlike "enze"/"ezze"
    "enza",
    "enze",
    "ezza",
    "ezze",
    "ita",
    "ore",
    "ori",
)

# Adjective endings in their singular/plural and masculine/feminine forms.
COMMON_ADJECTIVE_SUFFIXES = (
    "ale",
    "ali",
    "oso",
    "osa",
    "osi",
    "ose",
    "ivo",
    "iva",
    "ivi",
    "ive",
    "ente",
    "enti",
)


def maybe_fix_mojibake(text: str) -> str:
    """Repair UTF-8 text that was mistakenly decoded as Latin-1.

    The repair is attempted only when *text* contains "Ã" or "Â", the
    characters that UTF-8 lead bytes 0xC3 / 0xC2 become under Latin-1.
    If the Latin-1 -> UTF-8 round trip is not possible, *text* is returned
    unchanged.
    """
    looks_garbled = any(marker in text for marker in ("Ã", "Â"))
    if not looks_garbled:
        return text

    try:
        repaired = text.encode("latin-1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Not actually mojibake (or not recoverable): keep the input as is.
        return text
    return repaired


def strip_accents(text: str) -> str:
    """Return *text* with combining marks removed after NFKD decomposition."""
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)


def normalize_word(word: str) -> Optional[str]:
    """Reduce *word* to a plain lowercase a-z string, or reject it.

    Steps, in order: repair mojibake, trim surrounding whitespace, lowercase,
    unify apostrophe variants, strip accents, then remove apostrophes,
    hyphens and spaces.

    Returns:
        The normalized word, or ``None`` when the result is shorter than
        ``MIN_WORD_LENGTH`` or contains anything other than the letters a-z.
    """
    # Repair mojibake on the raw input, before any other transformation:
    # lower() would turn the "Ã"/"Â" markers into "ã"/"â" (so the repair would
    # never trigger), and strip() would drop a trailing "\xa0", which is the
    # second byte of a mis-decoded "à".
    clean = maybe_fix_mojibake(word).strip().lower()
    clean = clean.replace("’", "'").replace("`", "'")
    clean = strip_accents(clean)
    for unwanted in ("'", "-", " "):
        clean = clean.replace(unwanted, "")

    if len(clean) < MIN_WORD_LENGTH:
        return None
    if not re.fullmatch(r"[a-z]+", clean):
        return None
    return clean


def categorize_word(word: str) -> Dict[str, object]:
    """Tag *word* heuristically and compute a crossword quality score.

    Points are added for being a known function word, for matching the verb,
    adverb, noun or adjective suffix tables, for the length class and for
    letter variety. Points are subtracted for a run of three or more identical
    letters, for four or more consecutive non-vowels, and for exceeding
    ``MAX_WORD_LENGTH``.

    Returns:
        A dict with ``"tags"`` (sorted list of unique tag names) and
        ``"quality"`` (bonus points minus penalty points).
    """
    length = len(word)
    labels = set()
    bonus = 0
    malus = 0

    if word in COMMON_FUNCTION_WORDS:
        labels.add("function")
        bonus += 6

    # (suffix table, tag, points) -- a word may match several tables.
    suffix_rules = (
        (COMMON_VERB_SUFFIXES, "verb_infinitive", 4),
        (COMMON_ADVERB_SUFFIXES, "adverb", 3),
        (COMMON_NOUN_SUFFIXES, "noun_like", 3),
        (COMMON_ADJECTIVE_SUFFIXES, "adjective_like", 2),
    )
    for suffixes, label, points in suffix_rules:
        if word.endswith(suffixes):
            labels.add(label)
            bonus += points

    # Exactly one length class applies; medium-length words score highest.
    if length <= 4:
        size_label, size_points = "short", 2
    elif length <= 9:
        size_label, size_points = "medium", 3
    else:
        size_label, size_points = "long", 1
    labels.add(size_label)
    bonus += size_points

    distinct_letters = len(set(word))
    if distinct_letters >= max(4, length // 2):
        labels.add("varied_letters")
        bonus += 2

    # Any match is by construction a run of at least three identical characters.
    if re.search(r"(.)\1{2,}", word) is not None:
        labels.add("repetition_penalty")
        malus += 3

    if re.search(r"[^aeiou]{4,}", word) is not None:
        labels.add("cluster_penalty")
        malus += 2

    if length > MAX_WORD_LENGTH:
        labels.add("too_long")
        malus += 6

    return {"tags": sorted(labels), "quality": bonus - malus}


def is_good_crossword_word(word: str, meta: Dict[str, object]) -> bool:
    """Decide whether *word* belongs in the filtered vocabulary.

    *meta* must provide ``"tags"`` and ``"quality"``. A word is rejected when
    it is tagged ``"too_long"``, when its quality is below 2, or when it has
    13 or more letters, is neither a function word nor a verb infinitive, and
    its quality is below 5.
    """
    labels = frozenset(meta["tags"])
    score = int(meta["quality"])

    if "too_long" in labels or score < 2:
        return False

    exempt_from_long_rule = bool(labels & {"function", "verb_infinitive"})
    weak_long_word = len(word) >= 13 and not exempt_from_long_rule and score < 5
    return not weak_long_word


def extract_words(raw_words: Iterable[str]) -> List[str]:
    """Normalize *raw_words* and return the unique survivors in sorted order.

    Entries for which ``normalize_word`` returns ``None`` are discarded.
    """
    candidates = (normalize_word(raw) for raw in raw_words)
    unique_words = {candidate for candidate in candidates if candidate is not None}
    return sorted(unique_words)


def build_vocabulary(
    source_path: Path = PACKAGE_WORDS_PATH,
    output_path: Path = OUTPUT_PATH,
    filtered_output_path: Path = FILTERED_OUTPUT_PATH,
    metadata_output_path: Path = METADATA_OUTPUT_PATH,
) -> Dict[str, int]:
    """Build the extended and filtered word lists plus their metadata.

    Reads a JSON object from *source_path* and normalizes its keys. It then
    writes three files: the full normalized list, the subset accepted by
    ``is_good_crossword_word``, and the per-word metadata as JSON.

    Args:
        source_path: UTF-8 JSON file whose top-level keys are the raw words.
        output_path: Destination of the extended list, one word per line.
        filtered_output_path: Destination of the filtered list, one word per
            line. Previously this was always ``FILTERED_OUTPUT_PATH``, even
            when *output_path* was overridden.
        metadata_output_path: Destination of the metadata JSON. Previously
            this was always ``METADATA_OUTPUT_PATH``.

    Returns:
        Counts under the keys ``"extended_words"``, ``"filtered_words"`` and
        ``"metadata_entries"``.
    """
    payload = json.loads(source_path.read_text(encoding="utf-8"))
    words = extract_words(payload.keys())
    output_path.write_text("\n".join(words) + "\n", encoding="utf-8")

    metadata = {word: categorize_word(word) for word in words}
    filtered_words = [word for word in words if is_good_crossword_word(word, metadata[word])]

    filtered_output_path.write_text("\n".join(filtered_words) + "\n", encoding="utf-8")
    metadata_output_path.write_text(
        json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    return {
        "extended_words": len(words),
        "filtered_words": len(filtered_words),
        "metadata_entries": len(metadata),
    }


def main() -> None:
    """Build the vocabulary with the default paths and print a summary."""
    totals = build_vocabulary()
    summary_lines = (
        f"Vocabolario esteso: {OUTPUT_PATH}",
        f"Vocabolario filtrato: {FILTERED_OUTPUT_PATH}",
        f"Metadati: {METADATA_OUTPUT_PATH}",
        f"Parole estese: {totals['extended_words']}",
        f"Parole filtrate: {totals['filtered_words']}",
        f"Metadati generati: {totals['metadata_entries']}",
    )
    for line in summary_lines:
        print(line)


# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()