"""Build Italian crossword word lists from a JSON word source.

The script reads the keys of a JSON object, normalizes them to plain
lowercase ``a-z`` words, tags and scores every word with simple suffix and
shape heuristics, and writes three files: the full word list, a filtered
word list, and the per-word metadata as JSON.
"""

from __future__ import annotations

import json
import re
import unicodedata
from pathlib import Path
from typing import Dict, Iterable, List, Optional

PACKAGE_WORDS_PATH = Path(__file__).with_name("package") / "dist" / "words.json"
OUTPUT_PATH = Path(__file__).with_name("vocaboli_it_esteso.txt")
FILTERED_OUTPUT_PATH = Path(__file__).with_name("vocaboli_it_filtrato.txt")
METADATA_OUTPUT_PATH = Path(__file__).with_name("vocaboli_it_metadata.json")

MIN_WORD_LENGTH = 2
MAX_WORD_LENGTH = 14

COMMON_FUNCTION_WORDS = {
    "a", "ad", "al", "allo", "ai", "agli", "alla", "alle",
    "con", "col", "coi",
    "da", "dal", "dallo", "dai", "dagli", "dalla", "dalle",
    "di", "del", "dello", "dei", "degli", "della", "delle",
    "e", "ed",
    "in", "nel", "nello", "nei", "negli", "nella", "nelle",
    "o", "od",
    "per",
    "su", "sul", "sullo", "sui", "sugli", "sulla", "sulle",
    "tra", "fra",
}

COMMON_VERB_SUFFIXES = ("are", "ere", "ire")
COMMON_ADVERB_SUFFIXES = ("mente",)
COMMON_NOUN_SUFFIXES = (
    "zione", "zioni", "tore", "tori", "trice", "trici",
    "ista", "isti", "ismo", "ismi",
    "anza", "enze", "enza", "ezza", "ezze",
    "ita", "ore", "ori",
)
COMMON_ADJECTIVE_SUFFIXES = (
    "ale", "ali", "oso", "osa", "osi", "ose",
    "ivo", "iva", "ivi", "ive", "ente", "enti",
)

# Patterns are compiled once because they are applied to every word.
_ASCII_WORD_RE = re.compile(r"[a-z]+")
_REPEATED_RUN_RE = re.compile(r"(.)\1{2,}")
_CONSONANT_CLUSTER_RE = re.compile(r"[^aeiou]{4,}")


def maybe_fix_mojibake(text: str) -> str:
    """Undo UTF-8 text that was mis-decoded as Latin-1, when possible.

    The repair is only attempted when *text* contains "Ã" or "Â". If the
    Latin-1 -> UTF-8 round trip fails, *text* is returned unchanged.
    """
    if "Ã" not in text and "Â" not in text:
        return text
    try:
        return text.encode("latin-1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Best effort: not repairable as a whole, keep the input as it is.
        return text


def strip_accents(text: str) -> str:
    """Return *text* NFKD-decomposed with all combining marks removed."""
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(char for char in decomposed if not unicodedata.combining(char))


def normalize_word(word: str) -> Optional[str]:
    """Reduce *word* to a lowercase ``a-z`` string, or return ``None``.

    Apostrophes (including "’" and "`"), hyphens and spaces are removed and
    accents are stripped. ``None`` is returned when the result is shorter
    than ``MIN_WORD_LENGTH`` or contains anything other than ``a-z``.
    """
    # The mojibake repair must see the raw text: it looks for uppercase
    # "Ã"/"Â", which lower() would turn into "ã"/"â", and strip() could
    # remove a trailing "\xa0" that is part of a mis-decoded sequence.
    clean = maybe_fix_mojibake(word).strip().lower()
    clean = clean.replace("’", "'").replace("`", "'")
    clean = strip_accents(clean)
    clean = clean.replace("'", "").replace("-", "").replace(" ", "")
    if len(clean) < MIN_WORD_LENGTH:
        return None
    if not _ASCII_WORD_RE.fullmatch(clean):
        return None
    return clean


def categorize_word(word: str) -> Dict[str, object]:
    """Tag *word* and compute its heuristic quality score.

    Returns:
        A dict with ``"tags"`` (sorted list of unique tag names) and
        ``"quality"`` (bonus points minus penalty points).
    """
    tags: List[str] = []
    score = 0

    if word in COMMON_FUNCTION_WORDS:
        tags.append("function")
        score += 6
    if word.endswith(COMMON_VERB_SUFFIXES):
        tags.append("verb_infinitive")
        score += 4
    if word.endswith(COMMON_ADVERB_SUFFIXES):
        tags.append("adverb")
        score += 3
    if word.endswith(COMMON_NOUN_SUFFIXES):
        tags.append("noun_like")
        score += 3
    if word.endswith(COMMON_ADJECTIVE_SUFFIXES):
        tags.append("adjective_like")
        score += 2

    length = len(word)
    if length <= 4:
        tags.append("short")
        score += 2
    elif length <= 9:
        tags.append("medium")
        score += 3
    else:
        tags.append("long")
        score += 1

    if len(set(word)) >= max(4, length // 2):
        tags.append("varied_letters")
        score += 2

    penalty = 0
    # The pattern only matches runs of three or more identical characters,
    # so any match at all triggers the penalty.
    if _REPEATED_RUN_RE.search(word):
        tags.append("repetition_penalty")
        penalty += 3
    if _CONSONANT_CLUSTER_RE.search(word):
        tags.append("cluster_penalty")
        penalty += 2
    if length > MAX_WORD_LENGTH:
        tags.append("too_long")
        penalty += 6

    return {"tags": sorted(set(tags)), "quality": score - penalty}


def is_good_crossword_word(word: str, meta: Dict[str, object]) -> bool:
    """Decide whether *word* belongs in the filtered list.

    *meta* is a dict with ``"tags"`` and ``"quality"`` entries, as produced
    by ``categorize_word``. A word is rejected when it is tagged
    ``too_long``, when its quality is below 2, or when it has 13 or more
    letters, is neither a function word nor a verb infinitive, and has a
    quality below 5.
    """
    tags = set(meta["tags"])
    quality = int(meta["quality"])
    if "too_long" in tags:
        return False
    if (
        len(word) >= 13
        and "function" not in tags
        and "verb_infinitive" not in tags
        and quality < 5
    ):
        return False
    return quality >= 2


def extract_words(raw_words: Iterable[str]) -> List[str]:
    """Normalize *raw_words* and return the unique results, sorted."""
    candidates = (normalize_word(word) for word in raw_words)
    return sorted({clean for clean in candidates if clean is not None})


def build_vocabulary(
    source_path: Path = PACKAGE_WORDS_PATH,
    output_path: Path = OUTPUT_PATH,
    filtered_output_path: Path = FILTERED_OUTPUT_PATH,
    metadata_output_path: Path = METADATA_OUTPUT_PATH,
) -> Dict[str, int]:
    """Generate the full list, the filtered list and the metadata file.

    Args:
        source_path: UTF-8 JSON file holding an object whose keys are the
            raw words.
        output_path: Destination of the full normalized word list.
        filtered_output_path: Destination of the filtered word list.
        metadata_output_path: Destination of the per-word metadata JSON.

    Returns:
        Counts under the keys ``"extended_words"``, ``"filtered_words"``
        and ``"metadata_entries"``.
    """
    payload = json.loads(source_path.read_text(encoding="utf-8"))
    words = extract_words(payload.keys())
    output_path.write_text("\n".join(words) + "\n", encoding="utf-8")

    metadata = {word: categorize_word(word) for word in words}
    filtered_words = [
        word for word in words if is_good_crossword_word(word, metadata[word])
    ]
    filtered_output_path.write_text("\n".join(filtered_words) + "\n", encoding="utf-8")
    metadata_output_path.write_text(
        json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )

    return {
        "extended_words": len(words),
        "filtered_words": len(filtered_words),
        "metadata_entries": len(metadata),
    }


def main() -> None:
    """Build the vocabulary at the default paths and print a summary."""
    totals = build_vocabulary()
    print(f"Vocabolario esteso: {OUTPUT_PATH}")
    print(f"Vocabolario filtrato: {FILTERED_OUTPUT_PATH}")
    print(f"Metadati: {METADATA_OUTPUT_PATH}")
    print(f"Parole estese: {totals['extended_words']}")
    print(f"Parole filtrate: {totals['filtered_words']}")
    print(f"Metadati generati: {totals['metadata_entries']}")


if __name__ == "__main__":
    main()