244 lines
5.6 KiB
Python
244 lines
5.6 KiB
Python
from __future__ import annotations
|
||
|
||
import json
|
||
import re
|
||
import unicodedata
|
||
from pathlib import Path
|
||
from typing import Dict, Iterable, List, Optional
|
||
|
||
|
||
# Source word list: <module dir>/package/dist/words.json.
PACKAGE_WORDS_PATH = Path(__file__).parent / "package" / "dist" / "words.json"

# Generated files, all placed in the same directory as this module.
OUTPUT_PATH = Path(__file__).parent / "vocaboli_it_esteso.txt"
FILTERED_OUTPUT_PATH = Path(__file__).parent / "vocaboli_it_filtrato.txt"
METADATA_OUTPUT_PATH = Path(__file__).parent / "vocaboli_it_metadata.json"

# Words shorter than MIN_WORD_LENGTH are dropped during normalization;
# words longer than MAX_WORD_LENGTH are tagged "too_long" when categorized.
MIN_WORD_LENGTH = 2
MAX_WORD_LENGTH = 14
|
||
|
||
# Italian prepositions (plain and fused with an article) plus the
# conjunctions e/ed and o/od. Membership earns the "function" tag.
# One row per base word.
COMMON_FUNCTION_WORDS = {
    "a", "ad", "al", "allo", "ai", "agli", "alla", "alle",
    "con", "col", "coi",
    "da", "dal", "dallo", "dai", "dagli", "dalla", "dalle",
    "di", "del", "dello", "dei", "degli", "della", "delle",
    "e", "ed",
    "in", "nel", "nello", "nei", "negli", "nella", "nelle",
    "o", "od",
    "per",
    "su", "sul", "sullo", "sui", "sugli", "sulla", "sulle",
    "tra", "fra",
}
|
||
|
||
# Word endings used as part-of-speech hints. Each tuple is passed directly
# to str.endswith, so a word matches if it ends with any listed suffix.
COMMON_VERB_SUFFIXES = ("are", "ere", "ire")
COMMON_ADVERB_SUFFIXES = ("mente",)
COMMON_NOUN_SUFFIXES = (
    "zione", "zioni",
    "tore", "tori",
    "trice", "trici",
    "ista", "isti",
    "ismo", "ismi",
    "anza", "enze", "enza",
    "ezza", "ezze",
    "ita",
    "ore", "ori",
)
COMMON_ADJECTIVE_SUFFIXES = (
    "ale", "ali",
    "oso", "osa", "osi", "ose",
    "ivo", "iva", "ivi", "ive",
    "ente", "enti",
)
|
||
|
||
|
||
def maybe_fix_mojibake(text: str) -> str:
    """Undo UTF-8-read-as-Latin-1 garbling when *text* shows signs of it.

    The repair is attempted only if the text contains "Ã" or "Â". It
    re-encodes the text as Latin-1 and decodes the bytes as UTF-8. If
    either step fails, the input is returned unchanged.
    """
    looks_garbled = any(marker in text for marker in ("Ã", "Â"))
    if not looks_garbled:
        return text

    try:
        repaired = text.encode("latin-1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Not a clean Latin-1/UTF-8 round trip: keep the original text.
        return text
    return repaired
|
||
|
||
|
||
def strip_accents(text: str) -> str:
    """Return *text* with combining marks removed after NFKD decomposition."""
    decomposed = unicodedata.normalize("NFKD", text)
    # combining() returns 0 for base characters; anything else is a mark.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
|
||
|
||
|
||
def normalize_word(word: str) -> Optional[str]:
    """Reduce a raw dictionary entry to a plain lowercase a-z token.

    Steps, in order: repair mojibake, trim and lowercase, unify apostrophe
    variants, strip accents, then drop apostrophes, hyphens and spaces.

    Args:
        word: The raw entry as read from the source word list.

    Returns:
        The normalized token, or ``None`` if the result is shorter than
        ``MIN_WORD_LENGTH`` or contains anything other than the letters a-z.
    """
    # Mojibake must be repaired on the untouched text. Lowercasing first
    # turns the "Ã"/"Â" markers into "ã"/"â", so the repair would never
    # trigger. Stripping first removes a trailing U+00A0, which is the
    # second byte of a garbled "à".
    clean = maybe_fix_mojibake(word).strip().lower()
    clean = clean.replace("’", "'").replace("`", "'")
    clean = strip_accents(clean)
    clean = clean.replace("'", "").replace("-", "").replace(" ", "")

    if len(clean) < MIN_WORD_LENGTH:
        return None
    if not re.fullmatch(r"[a-z]+", clean):
        return None
    return clean
|
||
|
||
|
||
def categorize_word(word: str) -> Dict[str, object]:
    """Tag *word* with heuristic labels and compute a quality score.

    Points are awarded for being a function word, for matching the verb,
    adverb, noun or adjective suffix lists, for the length bucket, and for
    letter variety. Points are deducted for a letter repeated three or more
    times in a row, for a run of four or more non-vowels, and for exceeding
    ``MAX_WORD_LENGTH``.

    Returns:
        A dict with ``"tags"`` (sorted list of unique labels) and
        ``"quality"`` (points awarded minus points deducted).
    """
    size = len(word)
    labels: List[str] = []
    bonus = 0
    malus = 0

    if word in COMMON_FUNCTION_WORDS:
        labels.append("function")
        bonus += 6

    # (suffix tuple, label, points) - every matching rule applies.
    suffix_rules = (
        (COMMON_VERB_SUFFIXES, "verb_infinitive", 4),
        (COMMON_ADVERB_SUFFIXES, "adverb", 3),
        (COMMON_NOUN_SUFFIXES, "noun_like", 3),
        (COMMON_ADJECTIVE_SUFFIXES, "adjective_like", 2),
    )
    for suffixes, label, points in suffix_rules:
        if word.endswith(suffixes):
            labels.append(label)
            bonus += points

    # Exactly one length bucket: <=4 short, 5..9 medium, >=10 long.
    if size > 9:
        length_label, length_points = "long", 1
    elif size > 4:
        length_label, length_points = "medium", 3
    else:
        length_label, length_points = "short", 2
    labels.append(length_label)
    bonus += length_points

    distinct_letters = len(set(word))
    if distinct_letters >= max(4, size // 2):
        labels.append("varied_letters")
        bonus += 2

    # Same character three or more times in a row.
    if re.search(r"(.)\1{2,}", word) is not None:
        labels.append("repetition_penalty")
        malus += 3

    # Four or more consecutive characters that are not a/e/i/o/u.
    if re.search(r"[^aeiou]{4,}", word) is not None:
        labels.append("cluster_penalty")
        malus += 2

    if size > MAX_WORD_LENGTH:
        labels.append("too_long")
        malus += 6

    return {"tags": sorted(set(labels)), "quality": bonus - malus}
|
||
|
||
|
||
def is_good_crossword_word(word: str, meta: Dict[str, object]) -> bool:
    """Decide whether *word* passes the filter, given its metadata.

    Args:
        word: The word being judged; only its length is used.
        meta: Mapping with a ``"tags"`` iterable and a ``"quality"`` value
            convertible with ``int()``.

    Returns:
        ``False`` if the word is tagged ``too_long``, if its quality is
        below 2, or if it has 13 or more letters, is neither a function
        word nor a verb infinitive, and scores below 5. ``True`` otherwise.
    """
    labels = set(meta["tags"])
    rating = int(meta["quality"])

    if "too_long" in labels or rating < 2:
        return False

    # Long words need a higher score unless they carry an exempting tag.
    exempt = not labels.isdisjoint(("function", "verb_infinitive"))
    needs_high_score = len(word) >= 13 and not exempt
    return not (needs_high_score and rating < 5)
|
||
|
||
|
||
def extract_words(raw_words: Iterable[str]) -> List[str]:
    """Normalize every raw entry and return the unique results, sorted.

    Entries that ``normalize_word`` rejects (returns ``None`` for) are
    skipped. Entries that normalize to the same token are collapsed.
    """
    unique = {
        cleaned
        for cleaned in map(normalize_word, raw_words)
        if cleaned is not None
    }
    return sorted(unique)
|
||
|
||
|
||
def build_vocabulary(
    source_path: Path = PACKAGE_WORDS_PATH,
    output_path: Path = OUTPUT_PATH,
    filtered_output_path: Optional[Path] = None,
    metadata_output_path: Optional[Path] = None,
) -> Dict[str, int]:
    """Build the extended and filtered word lists plus per-word metadata.

    Reads a JSON object from *source_path*, normalizes its keys into a
    sorted, de-duplicated word list, categorizes each word, and writes
    three UTF-8 files.

    Args:
        source_path: JSON file whose top-level keys are the raw words.
        output_path: Destination for the full normalized list, one word
            per line.
        filtered_output_path: Destination for the words accepted by
            ``is_good_crossword_word``. Defaults to ``FILTERED_OUTPUT_PATH``.
        metadata_output_path: Destination for the JSON metadata mapping
            each word to its tags and quality. Defaults to
            ``METADATA_OUTPUT_PATH``.

    Returns:
        Counts under the keys ``"extended_words"``, ``"filtered_words"``
        and ``"metadata_entries"``.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth, exactly as when they were referenced directly.
    if filtered_output_path is None:
        filtered_output_path = FILTERED_OUTPUT_PATH
    if metadata_output_path is None:
        metadata_output_path = METADATA_OUTPUT_PATH

    payload = json.loads(source_path.read_text(encoding="utf-8"))
    words = extract_words(payload.keys())
    output_path.write_text("\n".join(words) + "\n", encoding="utf-8")

    metadata = {word: categorize_word(word) for word in words}
    filtered_words = [
        word for word in words if is_good_crossword_word(word, metadata[word])
    ]

    filtered_output_path.write_text(
        "\n".join(filtered_words) + "\n", encoding="utf-8"
    )
    metadata_output_path.write_text(
        json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    return {
        "extended_words": len(words),
        "filtered_words": len(filtered_words),
        "metadata_entries": len(metadata),
    }
|
||
|
||
|
||
def main() -> None:
    """Run the build with default paths and print a summary to stdout."""
    totals = build_vocabulary()
    summary = (
        f"Vocabolario esteso: {OUTPUT_PATH}",
        f"Vocabolario filtrato: {FILTERED_OUTPUT_PATH}",
        f"Metadati: {METADATA_OUTPUT_PATH}",
        f"Parole estese: {totals['extended_words']}",
        f"Parole filtrate: {totals['filtered_words']}",
        f"Metadati generati: {totals['metadata_entries']}",
    )
    for line in summary:
        print(line)


if __name__ == "__main__":
    main()
|