feat: aggiunge CLI unificata, build vocabolario e filtro lessicale

2026-04-13 19:04:01 +02:00
parent 0b42c2ecd4
commit 77c7e709b6
24 changed files with 161767 additions and 1 deletions
--- a/build_vocabulary.py
+++ b/build_vocabulary.py
@@ -0,0 +1,243 @@
+from __future__ import annotations
+
+import json
+import re
+import unicodedata
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional
+
+
+# Input: JSON word list expected at package/dist/words.json next to this script.
+PACKAGE_WORDS_PATH = Path(__file__).with_name("package") / "dist" / "words.json"
+# Outputs, all written next to this script: the full normalized word list,
+# the quality-filtered word list, and the per-word tags/quality metadata.
+OUTPUT_PATH = Path(__file__).with_name("vocaboli_it_esteso.txt")
+FILTERED_OUTPUT_PATH = Path(__file__).with_name("vocaboli_it_filtrato.txt")
+METADATA_OUTPUT_PATH = Path(__file__).with_name("vocaboli_it_metadata.json")
+# Normalized words shorter than this are rejected by normalize_word().
+MIN_WORD_LENGTH = 2
+# Words longer than this are tagged "too_long" by categorize_word().
+MAX_WORD_LENGTH = 14
+
+# Italian prepositions (simple and articulated) and conjunctions, written
+# without accents. categorize_word() tags an exact match as "function" and
+# gives it the largest single score bonus.
+COMMON_FUNCTION_WORDS = {
+    "a",
+    "ad",
+    "al",
+    "allo",
+    "ai",
+    "agli",
+    "alla",
+    "alle",
+    "con",
+    "col",
+    "coi",
+    "da",
+    "dal",
+    "dallo",
+    "dai",
+    "dagli",
+    "dalla",
+    "dalle",
+    "di",
+    "del",
+    "dello",
+    "dei",
+    "degli",
+    "della",
+    "delle",
+    "e",
+    "ed",
+    "in",
+    "nel",
+    "nello",
+    "nei",
+    "negli",
+    "nella",
+    "nelle",
+    "o",
+    "od",
+    "per",
+    "su",
+    "sul",
+    "sullo",
+    "sui",
+    "sugli",
+    "sulla",
+    "sulle",
+    "tra",
+    "fra",
+}
+
+# Suffix families used by categorize_word(). Each is a tuple so it can be
+# passed directly to str.endswith(); a word may match several families.
+# Infinitive endings -> tag "verb_infinitive".
+COMMON_VERB_SUFFIXES = ("are", "ere", "ire")
+# Adverb ending -> tag "adverb".
+COMMON_ADVERB_SUFFIXES = ("mente",)
+# Noun-forming endings -> tag "noun_like".
+# NOTE(review): "ita" presumably targets accent-stripped "-ità" nouns, and
+# "anza" has no plural "anze" counterpart in this list -- confirm intent.
+COMMON_NOUN_SUFFIXES = (
+    "zione",
+    "zioni",
+    "tore",
+    "tori",
+    "trice",
+    "trici",
+    "ista",
+    "isti",
+    "ismo",
+    "ismi",
+    "anza",
+    "enze",
+    "enza",
+    "ezza",
+    "ezze",
+    "ita",
+    "ore",
+    "ori",
+)
+# Adjective-forming endings -> tag "adjective_like".
+COMMON_ADJECTIVE_SUFFIXES = (
+    "ale",
+    "ali",
+    "oso",
+    "osa",
+    "osi",
+    "ose",
+    "ivo",
+    "iva",
+    "ivi",
+    "ive",
+    "ente",
+    "enti",
+)
+
+
+def maybe_fix_mojibake(text: str) -> str:
+    """Undo UTF-8 text that was mistakenly decoded as Latin-1.
+
+    The repair is attempted only when *text* contains "Ã" or "Â", the
+    characters that the UTF-8 lead bytes 0xC3/0xC2 become under Latin-1.
+    If the round trip (encode as Latin-1, decode as UTF-8) fails, *text*
+    is returned unchanged.
+    """
+    looks_garbled = any(marker in text for marker in ("Ã", "Â"))
+    if looks_garbled:
+        try:
+            raw_bytes = text.encode("latin-1")
+            return raw_bytes.decode("utf-8")
+        except (UnicodeEncodeError, UnicodeDecodeError):
+            # Not a clean Latin-1/UTF-8 mix-up: fall through and keep the input.
+            pass
+    return text
+
+
+def strip_accents(text: str) -> str:
+    """Return *text* with all combining marks removed.
+
+    The text is first decomposed with NFKD so that accented letters split
+    into a base character plus combining marks; the marks are then dropped.
+    """
+    decomposed = unicodedata.normalize("NFKD", text)
+    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
+    return "".join(base_chars)
+
+
+def normalize_word(word: str) -> Optional[str]:
+    """Reduce a raw dictionary entry to a plain lowercase a-z token.
+
+    Steps, in order: repair mojibake, trim surrounding whitespace,
+    lowercase, strip accents, then remove apostrophes (straight, curly and
+    backtick), hyphens and spaces.
+
+    Returns:
+        The normalized token, or None when it is shorter than
+        MIN_WORD_LENGTH or still contains anything other than a-z.
+    """
+    # Mojibake must be repaired on the raw text. Lowercasing first turns the
+    # "Ã"/"Â" markers into "ã"/"â", so the repair would never trigger (and the
+    # underlying bytes would no longer be valid UTF-8). Stripping first could
+    # remove a trailing NBSP that is really the second byte of a mis-decoded "à".
+    clean = maybe_fix_mojibake(word).strip().lower()
+    clean = strip_accents(clean)
+    # None of these characters is altered by strip_accents, so removing them
+    # afterwards is equivalent to mapping the apostrophe variants to "'" first.
+    for unwanted in ("’", "`", "'", "-", " "):
+        clean = clean.replace(unwanted, "")
+
+    if len(clean) < MIN_WORD_LENGTH:
+        return None
+    if not re.fullmatch(r"[a-z]+", clean):
+        return None
+    return clean
+
+
+def categorize_word(word: str) -> Dict[str, object]:
+    """Tag a normalized word and compute a heuristic quality score.
+
+    Bonuses are awarded for being a known function word, for each matching
+    suffix family (verb, adverb, noun, adjective), for the length class and
+    for letter variety. Penalties are applied for a run of three or more
+    identical characters, for four or more consecutive non-vowels, and for
+    exceeding MAX_WORD_LENGTH.
+
+    Returns:
+        A dict with "tags" (sorted list of unique tag names) and "quality"
+        (total bonus minus total penalty).
+    """
+    length = len(word)
+    labels = set()
+    bonus = 0
+    malus = 0
+
+    if word in COMMON_FUNCTION_WORDS:
+        labels.add("function")
+        bonus += 6
+
+    # Suffix heuristics: every matching family contributes independently.
+    suffix_rules = (
+        (COMMON_VERB_SUFFIXES, "verb_infinitive", 4),
+        (COMMON_ADVERB_SUFFIXES, "adverb", 3),
+        (COMMON_NOUN_SUFFIXES, "noun_like", 3),
+        (COMMON_ADJECTIVE_SUFFIXES, "adjective_like", 2),
+    )
+    for suffixes, label, points in suffix_rules:
+        if word.endswith(suffixes):
+            labels.add(label)
+            bonus += points
+
+    # Exactly one length class applies; medium-length words score highest.
+    if length <= 4:
+        size_label, size_points = "short", 2
+    elif length <= 9:
+        size_label, size_points = "medium", 3
+    else:
+        size_label, size_points = "long", 1
+    labels.add(size_label)
+    bonus += size_points
+
+    if len(set(word)) >= max(4, length // 2):
+        labels.add("varied_letters")
+        bonus += 2
+
+    # Any match of this pattern is a run of at least three equal characters.
+    if re.search(r"(.)\1{2,}", word) is not None:
+        labels.add("repetition_penalty")
+        malus += 3
+
+    # Four or more consecutive characters that are not a, e, i, o or u.
+    if re.search(r"[^aeiou]{4,}", word) is not None:
+        labels.add("cluster_penalty")
+        malus += 2
+
+    if length > MAX_WORD_LENGTH:
+        labels.add("too_long")
+        malus += 6
+
+    return {"tags": sorted(labels), "quality": bonus - malus}
+
+
+def is_good_crossword_word(word: str, meta: Dict[str, object]) -> bool:
+    """Decide whether *word* passes the crossword quality filter.
+
+    *meta* must provide "tags" and "quality" as produced by
+    categorize_word(). A word is rejected when it is tagged "too_long",
+    when its quality is below 2, or when it has 13+ letters, quality below
+    5 and is neither a function word nor a verb infinitive.
+    """
+    labels = frozenset(meta["tags"])
+    score = int(meta["quality"])
+
+    if "too_long" in labels or score < 2:
+        return False
+
+    # Long words need a higher score unless they belong to an exempt class.
+    exempt = not labels.isdisjoint(("function", "verb_infinitive"))
+    long_and_weak = len(word) >= 13 and score < 5
+    return exempt or not long_and_weak
+
+
+def extract_words(raw_words: Iterable[str]) -> List[str]:
+    """Normalize *raw_words* and return the unique survivors, sorted.
+
+    Entries for which normalize_word() returns None are dropped; entries
+    that normalize to the same token are collapsed into one.
+    """
+    candidates = map(normalize_word, raw_words)
+    unique_tokens = {token for token in candidates if token is not None}
+    return sorted(unique_tokens)
+
+
+def build_vocabulary(
+    source_path: Path = PACKAGE_WORDS_PATH,
+    output_path: Path = OUTPUT_PATH,
+    filtered_output_path: Path = FILTERED_OUTPUT_PATH,
+    metadata_output_path: Path = METADATA_OUTPUT_PATH,
+) -> Dict[str, int]:
+    """Build the extended and filtered vocabularies plus their metadata.
+
+    Reads a UTF-8 JSON object from *source_path* and treats its top-level
+    keys as raw words. Three files are written, all as UTF-8:
+
+    * *output_path*: every normalized word, one per line, sorted;
+    * *filtered_output_path*: only the words accepted by
+      is_good_crossword_word(), one per line;
+    * *metadata_output_path*: JSON mapping each normalized word to its
+      tags and quality score.
+
+    All destinations are parameters so that a caller choosing a custom
+    *output_path* can redirect the companion files as well, instead of
+    always overwriting the default files next to this script.
+
+    Returns:
+        Counts under the keys "extended_words", "filtered_words" and
+        "metadata_entries".
+    """
+    payload = json.loads(source_path.read_text(encoding="utf-8"))
+    words = extract_words(payload.keys())
+    output_path.write_text("\n".join(words) + "\n", encoding="utf-8")
+
+    metadata = {word: categorize_word(word) for word in words}
+    filtered_words = [word for word in words if is_good_crossword_word(word, metadata[word])]
+
+    filtered_output_path.write_text("\n".join(filtered_words) + "\n", encoding="utf-8")
+    metadata_output_path.write_text(
+        json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True),
+        encoding="utf-8",
+    )
+    return {
+        "extended_words": len(words),
+        "filtered_words": len(filtered_words),
+        "metadata_entries": len(metadata),
+    }
+
+
+def main() -> None:
+    """Build the vocabulary with default paths and print a short report."""
+    summary = build_vocabulary()
+    report_lines = [
+        f"Vocabolario esteso: {OUTPUT_PATH}",
+        f"Vocabolario filtrato: {FILTERED_OUTPUT_PATH}",
+        f"Metadati: {METADATA_OUTPUT_PATH}",
+        f"Parole estese: {summary['extended_words']}",
+        f"Parole filtrate: {summary['filtered_words']}",
+        f"Metadati generati: {summary['metadata_entries']}",
+    ]
+    print("\n".join(report_lines))
+
+
+# Run the build only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()