# cruciverba_1/main.py

from __future__ import annotations

import argparse
import json
from pathlib import Path
from typing import Dict, List

from build_vocabulary import (
    FILTERED_OUTPUT_PATH,
    METADATA_OUTPUT_PATH,
    OUTPUT_PATH,
    build_vocabulary,
)
from build_lexicon import LEXICON_OUTPUT_PATH, build_lexicon
from build_semantic_lexicon import SEMANTIC_LEXICON_OUTPUT_PATH, build_semantic_lexicon
from crossword_filler import CrosswordFiller, load_vocabulary, load_vocabulary_metadata
from crossword_generator import CrosswordGenerator, WORDS, render_grid


# Textual aliases accepted by --difficulty, mapped to the numeric level that is
# compared against each entry's `difficulty_word`. Level 3 has no alias and is
# only reachable by passing the number directly.
DIFFICULTY_ALIASES: Dict[str, int] = {
    "easy": 1,
    "medium": 2,
    "hard": 4,
    "expert": 5,
}

# Topic name that means "no theme": entries are not filtered by topic and every
# entry receives the same flat relevance.
DEFAULT_TOPIC = "general"
# By default as many seed words are requested as there are items in the WORDS
# list exported by crossword_generator.
DEFAULT_INITIAL_WORD_COUNT = len(WORDS)
# Word endings treated as a hint that a word is abstract-sounding; checked with
# str.endswith to penalise or exclude such words for concrete topics.
ABSTRACTISH_SUFFIXES = ("zione", "zioni", "mento", "menti", "ita", "ezza", "anza", "enza", "ismo")
# Part-of-speech tags an entry must carry to be offered to the filler.
FILL_ALLOWED_POS = {"NOUN", "VERB", "ADJ", "ADV", "PREP", "CONJ"}
# Minimum `quality_score` and maximum word length for soft-related and
# general-support fill words in themed puzzles.
GENERAL_FILL_MIN_QUALITY = 6
GENERAL_FILL_MAX_LENGTH = 10
# Upper bound on the number of soft-related (not strongly themed) entries kept.
SOFT_RELATED_FILL_LIMIT = 120
# Default value of --themed-fill-count.
DEFAULT_THEMED_FILL_WORD_COUNT = 10
# Topics considered concrete: for these, abstract-looking words lose relevance
# and seed words must be nouns without an abstract-looking suffix.
CONCRETE_TOPICS = {
    "animals",
    "plants",
    "nature",
    "ecology",
    "geography",
    "weather",
    "sea",
    "mountain",
    "health",
    "science",
    "sport",
    "history",
    "school",
    "cinema",
    "literature",
    "food",
    "city",
    "transport",
    "work",
    "home",
}

# Per-topic word fragments that signal a word belongs to the topic. A word
# containing one of them gains relevance, and seed words for a concrete topic
# listed here must contain at least one fragment. Topics without an entry have
# no such requirement.
TOPIC_SEED_REQUIRED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
    "transport": (
        "auto", "mot", "tren", "nav", "barc", "port", "pist", "vol", "aer",
        "bici", "cicl", "rimorch", "reattor", "vettur", "ambul", "imbarc",
        "trattor", "carr", "vap", "rota", "ruot",
    ),
    "animals": (
        "can", "gatt", "lup", "ors", "pesc", "aquil", "anatr", "cavall",
        "serpent", "tig", "leon", "volp", "cerv", "capr", "pecor",
    ),
    "nature": (
        "mar", "lag", "fium", "vent", "bosch", "mont", "collin", "isol",
        "rocc", "terra", "acqu", "fiore", "fogli", "radic", "affluent",
        "litoral", "piogg", "nev", "onda", "clim",
    ),
    "cinema": (
        "film", "cin", "teatr", "attor", "scen", "reg", "doppi", "dialog",
        "comic", "div", "docu", "pellic", "spettacol",
    ),
}

# Per-topic word fragments that disqualify a word for the topic even when a
# required fragment also matches (presumably false positives observed for the
# fragments above -- TODO confirm with the lexicon maintainers).
TOPIC_SEED_BLOCKED_SUBSTRINGS: Dict[str, tuple[str, ...]] = {
    "transport": (
        "intervist", "intratten", "speriment", "stermin", "investig",
        "intervent", "centometr", "sintetizz", "erot", "adoraz", "esalt",
        "eccit", "traduz", "fluttu", "sollecit",
    ),
    "animals": (
        "assicur", "finanz", "coediz", "camerier", "servitor", "indic",
        "estens", "diffus", "difensor", "spessor", "maggior",
    ),
    "cinema": (
        "manifest", "riediz", "dissimul", "diffus", "difensor", "estens",
        "malumor", "eversor",
    ),
}


def parse_args() -> argparse.Namespace:
    """Declare the command-line interface and parse the process arguments.

    Options are described in a table and registered in table order, which is
    also the order in which they are listed by ``--help``.

    Returns:
        The namespace produced by ``ArgumentParser.parse_args()``.
    """
    cli = argparse.ArgumentParser(description="Generatore e filler di cruciverba.")

    option_table = (
        ("--build-vocabulary", {
            "action": "store_true",
            "help": "Rigenera i file lessicali intermedi: vocabolario esteso, filtrato e metadati.",
        }),
        ("--build-lexicon", {
            "action": "store_true",
            "help": "Rigenera `lexicon_it.json` prima dell'esecuzione.",
        }),
        ("--skip-fill", {
            "action": "store_true",
            "help": "Genera solo la griglia iniziale e salta il riempimento con il filler.",
        }),
        ("--build-semantic-lexicon", {
            "action": "store_true",
            "help": "Rigenera `lexicon_it_semantic.json` arricchendo il lessico con IWN-OMW/ItalWordNet.",
        }),
        ("--vocabulary", {
            "type": Path,
            "default": None,
            "help": "Percorso opzionale a un vocabolario testuale personalizzato da usare al posto di quello di default.",
        }),
        ("--target-empty-ratio", {
            "type": float,
            "default": 1 / 6,
            "help": "Rapporto target di celle vuote residue dopo il filler. Esempio: 0.1667 lascia circa un sesto di celle vuote.",
        }),
        ("--time-limit", {
            "type": float,
            "default": 8.0,
            "help": "Tempo massimo in secondi per la fase di generazione iniziale della griglia.",
        }),
        ("--max-candidates", {
            "type": int,
            "default": 12,
            "help": "Numero massimo di candidati esplorati per parola nella generazione iniziale.",
        }),
        ("--diffxy", {
            "type": int,
            "default": 7,
            "help": "Differenza massima preferita tra larghezza e altezza della griglia iniziale.",
        }),
        ("--seed", {
            "type": int,
            "default": None,
            "help": "Seed casuale per ottenere varianti riproducibili del cruciverba: stesso seed, stesso risultato.",
        }),
        ("--difficulty", {
            "default": "medium",
            "help": "Difficolta lessicale del filler. Alias testuali: easy, medium, hard, expert. Internamente mappati a livelli numerici 1-5.",
        }),
        ("--topic", {
            "default": DEFAULT_TOPIC,
            "help": "Tema del cruciverba. Attualmente supporta i topic presenti nel lessico, ad esempio: general, nature, animals, actions, abstract.",
        }),
        ("--initial-word-count", {
            "type": int,
            "default": DEFAULT_INITIAL_WORD_COUNT,
            "help": "Numero di parole-seme usate per costruire la griglia iniziale prima del filler.",
        }),
        ("--themed-fill-count", {
            "type": int,
            "default": DEFAULT_THEMED_FILL_WORD_COUNT,
            "help": "Numero massimo indicativo di parole aggiunte dal filler da mantenere fortemente legate al tema.",
        }),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    return cli.parse_args()


def ensure_vocabulary(args: argparse.Namespace) -> None:
    """Rebuild the vocabulary files when requested or when one is missing.

    Nothing happens if ``--build-vocabulary`` was not given and both the
    filtered vocabulary and its metadata file already exist. Otherwise
    ``build_vocabulary()`` is run and a short report is printed.
    """
    if not args.build_vocabulary:
        if FILTERED_OUTPUT_PATH.exists() and METADATA_OUTPUT_PATH.exists():
            return

    report = build_vocabulary()
    print("Vocabolario rigenerato")
    print(f"- esteso: {OUTPUT_PATH}")
    print(f"- filtrato: {FILTERED_OUTPUT_PATH}")
    print(f"- metadati: {METADATA_OUTPUT_PATH}")
    print(f"- parole estese: {report['extended_words']}")
    print(f"- parole filtrate: {report['filtered_words']}")


def ensure_lexicon(args: argparse.Namespace) -> None:
    """Rebuild and save the lexicon when requested or when its file is missing.

    The lexicon returned by ``build_lexicon()`` is written as indented UTF-8
    JSON to ``LEXICON_OUTPUT_PATH`` and a short report is printed.
    """
    if not args.build_lexicon and LEXICON_OUTPUT_PATH.exists():
        return

    built = build_lexicon()
    serialized = json.dumps(built, ensure_ascii=False, indent=2)
    LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    print("Lessico rigenerato")
    print(f"- file: {LEXICON_OUTPUT_PATH}")
    print(f"- voci: {built['meta']['entry_count']}")


def ensure_semantic_lexicon(args: argparse.Namespace) -> None:
    """Rebuild and save the semantic lexicon when requested or when missing.

    The result of ``build_semantic_lexicon()`` is written as indented UTF-8
    JSON to ``SEMANTIC_LEXICON_OUTPUT_PATH``; the report printed afterwards
    includes how many entries carry a truthy ``semantic.matched`` flag.
    """
    if not args.build_semantic_lexicon and SEMANTIC_LEXICON_OUTPUT_PATH.exists():
        return

    semantic_lexicon = build_semantic_lexicon()
    serialized = json.dumps(semantic_lexicon, ensure_ascii=False, indent=2)
    SEMANTIC_LEXICON_OUTPUT_PATH.write_text(serialized, encoding="utf-8")

    matched_entries = 0
    for record in semantic_lexicon["entries"]:
        if record.get("semantic", {}).get("matched"):
            matched_entries += 1

    print("Lessico semantico rigenerato")
    print(f"- file: {SEMANTIC_LEXICON_OUTPUT_PATH}")
    print(f"- voci: {semantic_lexicon['meta']['entry_count']}")
    print(f"- match semantici: {matched_entries}")


def parse_difficulty(value: str) -> int:
    """Translate the ``--difficulty`` option into a numeric level.

    Args:
        value: A textual alias from ``DIFFICULTY_ALIASES`` (case and
            surrounding whitespace are ignored) or an integer written as text.

    Returns:
        The numeric difficulty level.

    Raises:
        SystemExit: If the value is neither an alias nor an integer, or if the
            integer lies outside 1..5.
    """
    normalized = str(value).strip().lower()

    alias_level = DIFFICULTY_ALIASES.get(normalized)
    if alias_level is not None:
        return alias_level

    try:
        numeric = int(normalized)
    except ValueError as exc:
        raise SystemExit(
            "Valore non valido per --difficulty. Usa easy, medium, hard, expert oppure un intero tra 1 e 5."
        ) from exc

    if numeric < 1 or numeric > 5:
        raise SystemExit("Il valore numerico di --difficulty deve essere compreso tra 1 e 5.")
    return numeric


def load_selected_vocabulary(path: Path | None) -> List[str]:
    if path is None:
        return load_vocabulary()
    return path.read_text(encoding="utf-8").splitlines()


def load_semantic_payload() -> Dict[str, object]:
    """Read the semantic lexicon from disk, building the file first if absent.

    Returns:
        The parsed JSON content of ``SEMANTIC_LEXICON_OUTPUT_PATH``.
    """
    lexicon_file = SEMANTIC_LEXICON_OUTPUT_PATH
    if not lexicon_file.exists():
        serialized = json.dumps(build_semantic_lexicon(), ensure_ascii=False, indent=2)
        lexicon_file.write_text(serialized, encoding="utf-8")

    raw_text = lexicon_file.read_text(encoding="utf-8")
    return json.loads(raw_text)


def entry_topics(entry: Dict[str, object]) -> tuple[set[str], set[str]]:
    """Return the lower-cased lexical and semantic topic sets of an entry.

    Args:
        entry: A lexicon entry. ``topics`` and ``semantic.semantic_topics``
            are read; each may be missing or null.

    Returns:
        ``(topics, semantic_topics)``, both as sets of lower-cased strings;
        a missing or null field yields an empty set.
    """
    # `or` (rather than a .get default) also covers keys that are present but
    # hold a JSON null, which would otherwise fail on iteration / .get().
    semantic = entry.get("semantic") or {}
    topics = {str(item).lower() for item in entry.get("topics") or ()}
    semantic_topics = {
        str(item).lower()
        for item in semantic.get("semantic_topics") or ()
    }
    return topics, semantic_topics


def matches_topic_roots(word: str, selected_topic: str) -> bool:
    """Tell whether ``word`` contains a root fragment of ``selected_topic``.

    A blocked fragment for the topic vetoes the match; a topic without any
    configured root fragments never matches.
    """
    for fragment in TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ()):
        if fragment in word:
            return False
    for fragment in TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic, ()):
        if fragment in word:
            return True
    return False


def topic_relevance(entry: Dict[str, object], topic: str) -> int:
    """Score how well an entry fits ``topic``; higher means more relevant.

    The default topic gives every entry a flat 20. Otherwise the score is the
    sum of the bonuses and penalties listed below that apply to the entry, so
    it can be negative.
    """
    wanted = topic.strip().lower()
    if wanted == DEFAULT_TOPIC:
        return 20

    form = str(entry.get("form", ""))
    lexical, semantic = entry_topics(entry)
    blocked_fragments = TOPIC_SEED_BLOCKED_SUBSTRINGS.get(wanted, ())

    # (condition, score contribution) pairs, summed when the condition holds.
    adjustments = (
        (wanted in lexical, 100),
        (wanted in semantic, 45),
        (matches_topic_roots(form, wanted), 35),
        ("general" in lexical, 5),
        (any(fragment in form for fragment in blocked_fragments), -80),
        (wanted in CONCRETE_TOPICS and form.endswith(ABSTRACTISH_SUFFIXES), -15),
    )
    return sum(delta for applies, delta in adjustments if applies)


def strong_topic_relevance(entry: Dict[str, object], topic: str) -> int:
    """Return 100 when the entry is lexically tagged with ``topic``, else 0.

    The default topic yields a flat 20 for every entry.
    """
    wanted = topic.strip().lower()
    if wanted == DEFAULT_TOPIC:
        return 20

    lexical_topics = entry_topics(entry)[0]
    if wanted in lexical_topics:
        return 100
    return 0


def lexical_fill_score(entry: Dict[str, object], topic: str) -> tuple[int, int, int, int, int, str]:
    """Build the sort key used to rank fill candidates for ``topic``.

    Returns:
        ``(topic relevance, quality, part-of-speech bonus, semantic bonus,
        length bonus, word)``; tuples compare element by element, so earlier
        components dominate.
    """
    form = str(entry.get("form", ""))
    quality = int(entry.get("quality_score", 0))

    pos_weights = {
        "NOUN": 12,
        "VERB": 8,
        "ADJ": 6,
        "ADV": 4,
        "PREP": 2,
        "CONJ": 2,
    }
    pos_bonus = pos_weights.get(str(entry.get("pos", "")), 0)

    semantic_bonus = 3 if entry.get("semantic", {}).get("matched") else 0

    size = len(form)
    if 4 <= size <= 10:
        length_bonus = 3
    elif 2 <= size <= 13:
        length_bonus = 1
    else:
        length_bonus = -4

    return (
        topic_relevance(entry, topic),
        quality,
        pos_bonus,
        semantic_bonus,
        length_bonus,
        form,
    )


def is_general_fill_support(entry: Dict[str, object]) -> bool:
    """Tell whether an entry may serve as generic filler in a themed puzzle.

    The entry must reach the minimum quality, be no longer than the maximum
    length, not end with an abstract-looking suffix, and be tagged with the
    default topic.
    """
    form = str(entry.get("form", ""))
    good_enough = int(entry.get("quality_score", 0)) >= GENERAL_FILL_MIN_QUALITY
    if not good_enough or len(form) > GENERAL_FILL_MAX_LENGTH or form.endswith(ABSTRACTISH_SUFFIXES):
        return False

    lexical_topics = {str(item).lower() for item in entry.get("topics", [])}
    return DEFAULT_TOPIC in lexical_topics


def load_filtered_entries(level: int, topic: str) -> List[Dict[str, object]]:
    """Select and rank the lexicon entries offered to the filler.

    Only entries allowed in crosswords, no harder than ``level`` and with an
    accepted part of speech are considered. For the default topic all of them
    are returned. For any other topic the result is made of:

    * every entry lexically tagged with the topic ("strong");
    * up to ``SOFT_RELATED_FILL_LIMIT`` best-ranked entries with positive
      topic relevance that pass the quality/length/suffix limits ("soft");
    * general-support entries not already picked as soft.

    Args:
        level: Maximum accepted ``difficulty_word``.
        topic: Requested topic; case and surrounding whitespace are ignored.

    Returns:
        The selected entries sorted by ``lexical_fill_score`` for the topic,
        best first.
    """
    payload = load_semantic_payload()
    normalized_topic = topic.strip().lower()

    def fill_rank(entry: Dict[str, object]) -> tuple[int, int, int, int, int, str]:
        return lexical_fill_score(entry, normalized_topic)

    eligible = [
        entry
        for entry in payload.get("entries", [])
        if entry.get("allowed_in_crossword", False)
        and int(entry.get("difficulty_word", 5)) <= level
        and str(entry.get("pos", "")) in FILL_ALLOWED_POS
    ]

    if normalized_topic == DEFAULT_TOPIC:
        selected = eligible
    else:
        # Split once by the strong-relevance predicate instead of testing
        # `entry not in strong_topic`, which scanned the list (comparing
        # dicts) for every entry and made the selection quadratic.
        strong_topic: List[Dict[str, object]] = []
        remainder: List[Dict[str, object]] = []
        for entry in eligible:
            if strong_topic_relevance(entry, normalized_topic) > 0:
                strong_topic.append(entry)
            else:
                remainder.append(entry)

        soft_related = [
            entry
            for entry in remainder
            if topic_relevance(entry, normalized_topic) > 0
            and int(entry.get("quality_score", 0)) >= GENERAL_FILL_MIN_QUALITY
            and len(str(entry.get("form", ""))) <= GENERAL_FILL_MAX_LENGTH
            and not str(entry.get("form", "")).endswith(ABSTRACTISH_SUFFIXES)
        ]
        soft_related.sort(key=fill_rank, reverse=True)
        soft_selected = soft_related[:SOFT_RELATED_FILL_LIMIT]

        # Entries are dicts (unhashable), so "already selected" is tracked by
        # object identity; every entry is a distinct object of the payload.
        already_selected = {id(entry) for entry in soft_selected}
        general_support = [
            entry
            for entry in remainder
            if id(entry) not in already_selected
            and is_general_fill_support(entry)
        ]
        # No separate sort of general_support: the final sort below orders
        # the whole selection, and entries it considers tied are tied under
        # the general ranking as well, so their relative order is unchanged.
        selected = strong_topic + soft_selected + general_support

    selected.sort(key=fill_rank, reverse=True)
    return selected


def load_filtered_vocabulary(level: int, topic: str) -> List[str]:
    """Return the word forms of the entries selected for ``level``/``topic``.

    The order is the ranking produced by ``load_filtered_entries``.
    """
    forms: List[str] = []
    for record in load_filtered_entries(level, topic):
        forms.append(str(record["form"]))
    return forms


def load_semantic_metadata_for_vocabulary(words: List[str], topic: str) -> Dict[str, Dict[str, object]]:
    """Collect the semantic lexicon entries of ``words``, annotated for ``topic``.

    Returns:
        A mapping from word form to a shallow copy of its lexicon entry with
        two extra keys, ``_topic_relevance`` and ``_strong_topic_relevance``.
        If a form occurs several times in the lexicon, the last entry wins.
    """
    lexicon_entries = load_semantic_payload().get("entries", [])
    wanted_forms = set(words)

    by_form: Dict[str, Dict[str, object]] = {}
    for record in lexicon_entries:
        form = str(record.get("form", ""))
        if form in wanted_forms:
            annotated = dict(record)
            annotated["_topic_relevance"] = topic_relevance(annotated, topic)
            annotated["_strong_topic_relevance"] = strong_topic_relevance(annotated, topic)
            by_form[form] = annotated
    return by_form


def select_initial_words(level: int, topic: str, count: int) -> List[str]:
    """Choose the seed words used to build the initial grid.

    Entries allowed in crosswords and no harder than ``level`` form the
    candidate pool: those lexically tagged with ``topic``, or those tagged
    with the default topic when ``topic`` is the default one. Seeds are then
    picked greedily so that each new word shares letters with the words
    already chosen.

    If the requested topic has no eligible entry at all, the general pool is
    used and the selection proceeds exactly as for the default topic, so an
    unknown or too-hard topic still yields seeds instead of an empty list.

    For the default topic (including the fallback above) a short selection is
    topped up first from the unfiltered pool and then from ``WORDS``. A themed
    selection is never padded with off-topic words and may therefore contain
    fewer than ``count`` items.

    Args:
        level: Maximum accepted ``difficulty_word``.
        topic: Requested topic; case and surrounding whitespace are ignored.
        count: Number of seed words wanted.

    Returns:
        At most ``count`` word forms.
    """
    payload = load_semantic_payload()
    normalized_topic = topic.strip().lower()
    abstract_like_topics = {"abstract", "actions"}

    def word_score(
        entry: Dict[str, object], selected_topic: str
    ) -> tuple[int, int, int, int, int, int, int, str]:
        """Sort key ranking an entry as a seed for the topic (higher is better)."""
        topics, _ = entry_topics(entry)
        quality = int(entry.get("quality_score", 0))
        semantic = entry.get("semantic", {})
        semantic_match = 1 if semantic.get("matched") else 0
        glossary_bonus = min(3, len(semantic.get("glosses", [])))
        word = str(entry.get("form", ""))
        pos = str(entry.get("pos", ""))
        length = len(word)

        topic_bonus = 0
        if selected_topic in topics:
            topic_bonus += 4
        if "general" in topics:
            topic_bonus += 1

        pos_bonus = 0
        if pos == "NOUN":
            pos_bonus += 4
        elif pos == "ADJ":
            pos_bonus += 1

        # For concrete themes, push down words that look abstract or that are
        # tagged abstract/action without also being tagged with the theme.
        topical_concreteness_penalty = 0
        if selected_topic not in abstract_like_topics and selected_topic != DEFAULT_TOPIC:
            if "abstract" in topics and selected_topic not in topics:
                topical_concreteness_penalty -= 3
            if "actions" in topics and selected_topic not in topics:
                topical_concreteness_penalty -= 2
            if word.endswith(ABSTRACTISH_SUFFIXES):
                topical_concreteness_penalty -= 4
            if pos != "NOUN":
                topical_concreteness_penalty -= 3

        if 5 <= length <= 10:
            length_bonus = 3
        elif 4 <= length <= 12:
            length_bonus = 1
        else:
            length_bonus = -2

        return (
            topic_bonus,
            pos_bonus,
            topical_concreteness_penalty,
            quality,
            semantic_match,
            glossary_bonus,
            length_bonus,
            word,
        )

    def is_seed_friendly(entry: Dict[str, object], selected_topic: str) -> bool:
        """Strict filter: length, blocked fragments, concreteness and topic tag."""
        word = str(entry.get("form", ""))
        if len(word) < 4 or len(word) > 13:
            return False
        if selected_topic in CONCRETE_TOPICS:
            if str(entry.get("pos", "")) != "NOUN":
                return False
            if word.endswith(ABSTRACTISH_SUFFIXES):
                return False
        blocked_substrings = TOPIC_SEED_BLOCKED_SUBSTRINGS.get(selected_topic, ())
        if any(part in word for part in blocked_substrings):
            return False
        required_substrings = TOPIC_SEED_REQUIRED_SUBSTRINGS.get(selected_topic)
        if (
            selected_topic in CONCRETE_TOPICS
            and required_substrings
            and selected_topic != DEFAULT_TOPIC
            and not any(part in word for part in required_substrings)
        ):
            return False
        if selected_topic == DEFAULT_TOPIC:
            return True
        return selected_topic in entry_topics(entry)[0]

    def overlap_score(left: str, right: str) -> int:
        """Number of letters two words share, counting repeated letters."""
        shared = set(left) & set(right)
        return sum(min(left.count(ch), right.count(ch)) for ch in shared)

    def pick_seed_set(entries: List[Dict[str, object]], selected_topic: str, target_count: int) -> List[str]:
        """Greedily pick words that interlock well, starting from the best-ranked one."""
        if not entries:
            return []

        # Score every entry once; the greedy rounds below reuse the score
        # instead of recomputing it for each candidate in each round.
        ranked = sorted(
            ((word_score(entry, selected_topic), entry) for entry in entries),
            key=lambda scored: scored[0],
            reverse=True,
        )

        chosen: List[str] = [str(ranked[0][1]["form"])]
        taken = set(chosen)

        while len(chosen) < target_count:
            best_entry = None
            best_key = None
            for score, entry in ranked:
                word = str(entry.get("form", ""))
                if word in taken:
                    continue
                overlaps = [overlap_score(word, existing) for existing in chosen]
                max_overlap = max(overlaps)
                same_length_penalty = -sum(1 for existing in chosen if len(existing) == len(word))
                key = (
                    1 if max_overlap >= 2 else 0,
                    sum(overlaps),
                    max_overlap,
                    same_length_penalty,
                    len(set(word)),
                    score,
                )
                # Strict comparison keeps the earliest-ranked entry on ties.
                if best_key is None or key > best_key:
                    best_key = key
                    best_entry = entry
            if best_entry is None:
                break
            picked = str(best_entry["form"])
            chosen.append(picked)
            taken.add(picked)

        return chosen

    eligible = [
        entry
        for entry in payload.get("entries", [])
        if entry.get("allowed_in_crossword", False)
        and int(entry.get("difficulty_word", 5)) <= level
    ]

    lexical_topical = [entry for entry in eligible if normalized_topic in entry_topics(entry)[0]]
    fallback = [entry for entry in eligible if DEFAULT_TOPIC in entry_topics(entry)[0]]

    # `effective_topic` is the topic the pool is actually judged against. When
    # a themed request has no entries and falls back to the general pool, the
    # entries must be judged as general words: judged against the missing
    # topic, every one of them would be rejected and the fallback would
    # always produce an empty seed list.
    if normalized_topic == DEFAULT_TOPIC:
        pool = fallback
        effective_topic = DEFAULT_TOPIC
    elif lexical_topical:
        pool = lexical_topical
        effective_topic = normalized_topic
    else:
        pool = fallback
        effective_topic = DEFAULT_TOPIC

    strict_pool = [entry for entry in pool if is_seed_friendly(entry, effective_topic)]
    selected = pick_seed_set(strict_pool, effective_topic, count)

    if effective_topic == DEFAULT_TOPIC:
        if len(selected) < count:
            # Relax the seed-friendliness filter (pick_seed_set ranks the
            # pool itself, so no pre-sorting is needed).
            for word in pick_seed_set(pool, effective_topic, count):
                if word not in selected:
                    selected.append(word)
                if len(selected) >= count:
                    break

        if len(selected) < count:
            # Last resort: the generator's own word list.
            for word in WORDS:
                if word in selected:
                    continue
                selected.append(word)
                if len(selected) >= count:
                    break

    return selected[:count]


def main() -> None:
    """Run the command-line pipeline.

    Parses the options, makes sure the lexical files exist, builds and prints
    the seed grid and, unless ``--skip-fill`` was given, runs the filler and
    prints the filled grid together with the words it added.
    """
    options = parse_args()

    # Lexical artefacts first: each step is a no-op when its file is present
    # and no rebuild was requested.
    for ensure_step in (ensure_vocabulary, ensure_lexicon, ensure_semantic_lexicon):
        ensure_step(options)

    level = parse_difficulty(options.difficulty)
    seeds = select_initial_words(level, options.topic, options.initial_word_count)

    generator = CrosswordGenerator(
        seeds,
        diffxy=options.diffxy,
        time_limit_seconds=options.time_limit,
        max_candidates_per_word=options.max_candidates,
        seed=options.seed,
    )
    seeded = generator.solve()

    print("Griglia iniziale")
    print(f"Parole-seme richieste: {len(seeds)}")
    print(f"Parole inserite: {seeded.placed_words}/{len(generator.words)}")
    print(f"Intersezioni: {seeded.intersections}")
    print(f"Dimensioni: {seeded.width()} x {seeded.height()} (diff={seeded.shape_difference()})")
    print(f"Difficolta filler: {options.difficulty} -> livello {level}")
    print(f"Tema filler: {options.topic}")
    if options.seed is not None:
        print(f"Seed: {options.seed}")
    print()
    print(render_grid(seeded.grid, seeded.placements))
    print()
    print("Parole-seme selezionate:")
    print(", ".join(seeds))

    if options.skip_fill:
        return

    # A custom vocabulary replaces the lexicon-driven one and carries no
    # semantic metadata.
    custom_vocabulary = options.vocabulary
    if custom_vocabulary:
        vocabulary = load_selected_vocabulary(custom_vocabulary)
    else:
        vocabulary = load_filtered_vocabulary(level, options.topic)
    metadata = load_vocabulary_metadata()
    if custom_vocabulary:
        semantic_metadata = {}
    else:
        semantic_metadata = load_semantic_metadata_for_vocabulary(vocabulary, options.topic)

    filler = CrosswordFiller(
        seeded,
        vocabulary,
        target_empty_ratio=options.target_empty_ratio,
        vocabulary_metadata=metadata,
        semantic_metadata=semantic_metadata,
        selected_topic=options.topic,
        max_themed_fill_words=options.themed_fill_count,
        seed=options.seed,
    )
    filled = filler.fill()

    print()
    print("Griglia riempita")
    print(f"Parole totali: {filled.placed_words}")
    print(f"Intersezioni totali: {filled.intersections}")
    print(f"Dimensioni: {filled.width()} x {filled.height()} (diff={filled.shape_difference()})")
    print()
    print(render_grid(filled.grid, filled.placements))

    if not filler.added_words:
        return

    print()
    print("Parole aggiunte dal filler:")
    for index, placement in enumerate(filler.added_words, start=1):
        if placement.direction == "H":
            direction = "orizzontale"
        else:
            direction = "verticale"
        print(f"{index:>2}. {placement.word} ({placement.x}, {placement.y}) {direction}")


if __name__ == "__main__":
    main()