feat: aggiunge CLI unificata, build vocabolario e filtro lessicale

This commit is contained in:
2026-04-13 19:04:01 +02:00
parent 0b42c2ecd4
commit 77c7e709b6
24 changed files with 161767 additions and 1 deletions

243
build_vocabulary.py Normal file
View File

@@ -0,0 +1,243 @@
from __future__ import annotations
import json
import re
import unicodedata
from pathlib import Path
from typing import Dict, Iterable, List, Optional
# Every input and output file lives next to this script.
_BASE_DIR = Path(__file__).parent

# Source word list: <script dir>/package/dist/words.json
PACKAGE_WORDS_PATH = _BASE_DIR / "package" / "dist" / "words.json"

# Generated artefacts: full word list, filtered word list, per-word metadata.
OUTPUT_PATH = _BASE_DIR / "vocaboli_it_esteso.txt"
FILTERED_OUTPUT_PATH = _BASE_DIR / "vocaboli_it_filtrato.txt"
METADATA_OUTPUT_PATH = _BASE_DIR / "vocaboli_it_metadata.json"

# Normalised words shorter than MIN_WORD_LENGTH are discarded by
# normalize_word; words longer than MAX_WORD_LENGTH are tagged "too_long"
# by categorize_word.
MIN_WORD_LENGTH, MAX_WORD_LENGTH = 2, 14
# Italian function words: simple prepositions, their articulated forms and a
# few short conjunctions. categorize_word tags members of this set "function".
COMMON_FUNCTION_WORDS = {
    # "a" family
    "a", "ad", "al", "allo", "ai", "agli", "alla", "alle",
    # "con" family
    "con", "col", "coi",
    # "da" family
    "da", "dal", "dallo", "dai", "dagli", "dalla", "dalle",
    # "di" family
    "di", "del", "dello", "dei", "degli", "della", "delle",
    # conjunctions
    "e", "ed", "o", "od",
    # "in" family
    "in", "nel", "nello", "nei", "negli", "nella", "nelle",
    # "per"
    "per",
    # "su" family
    "su", "sul", "sullo", "sui", "sugli", "sulla", "sulle",
    # "tra" / "fra"
    "tra", "fra",
}
# Endings that make categorize_word tag a word "verb_infinitive".
COMMON_VERB_SUFFIXES = (
    "are",
    "ere",
    "ire",
)

# Ending that makes categorize_word tag a word "adverb".
COMMON_ADVERB_SUFFIXES = (
    "mente",
)
# Endings that make categorize_word tag a word "noun_like". Kept as a tuple so
# it can be passed straight to str.endswith. Inflecting suffixes are listed as
# singular/plural pairs; "anze" completes the "anza" pair so that plurals such
# as "speranze" are tagged like their singular form.
COMMON_NOUN_SUFFIXES = (
    "zione", "zioni",
    "tore", "tori",
    "trice", "trici",
    "ista", "isti",
    "ismo", "ismi",
    "anza", "anze",
    "enza", "enze",
    "ezza", "ezze",
    "ita",  # accent already stripped by normalisation (e.g. "citta")
    "ore", "ori",
)
# Endings that make categorize_word tag a word "adjective_like"; one row per
# suffix family.
COMMON_ADJECTIVE_SUFFIXES = (
    "ale", "ali",
    "oso", "osa", "osi", "ose",
    "ivo", "iva", "ivi", "ive",
    "ente", "enti",
)
def maybe_fix_mojibake(text: str) -> str:
    """Undo UTF-8-read-as-Latin-1 garbling when *text* shows signs of it.

    The text is only touched when it contains one of the marker characters
    "Ã" or "Â". In that case it is re-encoded as Latin-1 and decoded as
    UTF-8; if either step fails the input is returned unchanged.
    """
    looks_garbled = any(marker in text for marker in ("Ã", "Â"))
    if not looks_garbled:
        return text

    try:
        repaired = text.encode("latin-1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Not a clean Latin-1/UTF-8 round trip: keep the text as it came in.
        return text
    return repaired
def strip_accents(text: str) -> str:
    """Return *text* with combining marks removed after NFKD decomposition."""
    decomposed = unicodedata.normalize("NFKD", text)
    # unicodedata.combining() is 0 for every non-combining character.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
def normalize_word(word: str) -> Optional[str]:
    """Reduce a raw dictionary entry to a plain lowercase ``a-z`` word.

    Steps: repair mojibake, trim and lower-case, map typographic apostrophes
    and backticks to ``'``, strip accents, then remove apostrophes, hyphens
    and spaces.

    Args:
        word: Raw entry as read from the source word list.

    Returns:
        The normalised word, or ``None`` when the result is shorter than
        ``MIN_WORD_LENGTH`` or contains anything other than ``a-z``.
    """
    # Repair mojibake on the raw text, before any other transformation:
    # lower-casing would turn the "Ã"/"Â" markers into "ã"/"â" (so the repair
    # would never trigger) and strip() could drop a trailing no-break space
    # that is part of a garbled sequence.
    clean = maybe_fix_mojibake(word).strip().lower()

    # Treat typographic apostrophes and backticks like the ASCII apostrophe so
    # that elided forms survive the a-z check below.
    for apostrophe_variant in ("\u2019", "\u2018", "`"):
        clean = clean.replace(apostrophe_variant, "'")

    clean = strip_accents(clean)

    # Joining characters are dropped, gluing the parts of the word together.
    for separator in ("'", "-", " "):
        clean = clean.replace(separator, "")

    if len(clean) < MIN_WORD_LENGTH:
        return None
    if not re.fullmatch(r"[a-z]+", clean):
        return None
    return clean
def categorize_word(word: str) -> Dict[str, object]:
    """Tag *word* with heuristic labels and compute a quality score.

    Points are added for membership in the function-word set, for matching
    suffix families, for the length bucket and for letter variety. Points are
    subtracted for runs of three or more identical characters, for runs of
    four or more non-vowels, and for exceeding ``MAX_WORD_LENGTH``.

    Returns:
        A dict with ``"tags"`` (sorted list of unique labels) and
        ``"quality"`` (bonus points minus penalty points).
    """
    length = len(word)
    labels: List[str] = []
    bonus = 0
    malus = 0

    if word in COMMON_FUNCTION_WORDS:
        labels.append("function")
        bonus += 6

    # (suffix tuple, label, points) - checked in this order.
    suffix_rules = (
        (COMMON_VERB_SUFFIXES, "verb_infinitive", 4),
        (COMMON_ADVERB_SUFFIXES, "adverb", 3),
        (COMMON_NOUN_SUFFIXES, "noun_like", 3),
        (COMMON_ADJECTIVE_SUFFIXES, "adjective_like", 2),
    )
    for suffixes, label, points in suffix_rules:
        if word.endswith(suffixes):
            labels.append(label)
            bonus += points

    # Length bucket: up to 4 is short, 5-9 is medium, anything above is long.
    if length <= 4:
        size_label, size_points = "short", 2
    elif length <= 9:
        size_label, size_points = "medium", 3
    else:
        size_label, size_points = "long", 1
    labels.append(size_label)
    bonus += size_points

    distinct_letters = len(set(word))
    if distinct_letters >= max(4, length // 2):
        labels.append("varied_letters")
        bonus += 2

    # The pattern only matches runs of at least three identical characters.
    if re.search(r"(.)\1{2,}", word) is not None:
        labels.append("repetition_penalty")
        malus += 3

    if re.search(r"[^aeiou]{4,}", word) is not None:
        labels.append("cluster_penalty")
        malus += 2

    if length > MAX_WORD_LENGTH:
        labels.append("too_long")
        malus += 6

    return {"tags": sorted(set(labels)), "quality": bonus - malus}
def is_good_crossword_word(word: str, meta: Dict[str, object]) -> bool:
    """Decide whether *word* is kept in the filtered vocabulary.

    Args:
        word: The normalised word.
        meta: Mapping with a ``"tags"`` iterable and a ``"quality"`` value
            convertible with ``int``.

    Returns:
        ``False`` when the word is tagged ``"too_long"``, when its quality is
        below 2, or when it has 13+ letters, quality below 5 and is neither a
        function word nor a verb infinitive; ``True`` otherwise.
    """
    labels = set(meta["tags"])
    score = int(meta["quality"])

    if "too_long" in labels or score < 2:
        return False

    # Very long words need a higher score unless one of these tags exempts them.
    exempt = bool(labels & {"function", "verb_infinitive"})
    rejected_as_long = len(word) >= 13 and not exempt and score < 5
    return not rejected_as_long
def extract_words(raw_words: Iterable[str]) -> List[str]:
    """Normalise *raw_words* and return the unique results in sorted order.

    Entries for which ``normalize_word`` returns ``None`` are dropped.
    """
    candidates = map(normalize_word, raw_words)
    unique_words = {candidate for candidate in candidates if candidate is not None}
    return sorted(unique_words)
def build_vocabulary(
    source_path: Path = PACKAGE_WORDS_PATH,
    output_path: Path = OUTPUT_PATH,
    *,
    filtered_output_path: Path = FILTERED_OUTPUT_PATH,
    metadata_output_path: Path = METADATA_OUTPUT_PATH,
) -> Dict[str, int]:
    """Build the extended and filtered word lists plus per-word metadata.

    Args:
        source_path: UTF-8 JSON file containing an object; its keys are taken
            as the candidate words.
        output_path: Destination of the extended list (one word per line).
        filtered_output_path: Destination of the filtered list (one word per
            line). Keyword-only, so a caller redirecting ``output_path`` can
            redirect this file too instead of overwriting the default one.
        metadata_output_path: Destination of the JSON metadata (word ->
            tags/quality). Keyword-only, same rationale as above.

    Returns:
        Counts under the keys ``"extended_words"``, ``"filtered_words"`` and
        ``"metadata_entries"``.

    Raises:
        OSError: If a file cannot be read or written.
        json.JSONDecodeError: If ``source_path`` does not contain valid JSON.
    """
    payload = json.loads(source_path.read_text(encoding="utf-8"))
    words = extract_words(payload.keys())
    output_path.write_text("\n".join(words) + "\n", encoding="utf-8")

    metadata = {word: categorize_word(word) for word in words}
    filtered_words = [
        word for word in words if is_good_crossword_word(word, metadata[word])
    ]
    filtered_output_path.write_text("\n".join(filtered_words) + "\n", encoding="utf-8")
    metadata_output_path.write_text(
        json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )

    return {
        "extended_words": len(words),
        "filtered_words": len(filtered_words),
        "metadata_entries": len(metadata),
    }
def main() -> None:
    """Build the vocabulary with the default paths and print a short report."""
    totals = build_vocabulary()
    report_lines = (
        f"Vocabolario esteso: {OUTPUT_PATH}",
        f"Vocabolario filtrato: {FILTERED_OUTPUT_PATH}",
        f"Metadati: {METADATA_OUTPUT_PATH}",
        f"Parole estese: {totals['extended_words']}",
        f"Parole filtrate: {totals['filtered_words']}",
        f"Metadati generati: {totals['metadata_entries']}",
    )
    for line in report_lines:
        print(line)


# Run the build only when executed as a script, not on import.
if __name__ == "__main__":
    main()