Files
cruciverba_1/build_vocabulary.py

244 lines
5.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from __future__ import annotations
import json
import re
import unicodedata
from pathlib import Path
from typing import Dict, Iterable, List, Optional
# Directory containing this script; every input/output path below lives in it.
_SCRIPT_DIR = Path(__file__).parent

# JSON word source read by build_vocabulary().
PACKAGE_WORDS_PATH = _SCRIPT_DIR / "package" / "dist" / "words.json"
# Full normalized word list, one word per line.
OUTPUT_PATH = _SCRIPT_DIR / "vocaboli_it_esteso.txt"
# Subset of the word list accepted by is_good_crossword_word().
FILTERED_OUTPUT_PATH = _SCRIPT_DIR / "vocaboli_it_filtrato.txt"
# Per-word tags and quality score, serialized as JSON.
METADATA_OUTPUT_PATH = _SCRIPT_DIR / "vocaboli_it_metadata.json"

# Normalized words shorter than this are discarded by normalize_word().
MIN_WORD_LENGTH = 2
# Words longer than this are tagged "too_long" by categorize_word().
MAX_WORD_LENGTH = 14
# Short grammatical words that categorize_word() tags as "function".
# Entries are grouped on one line per base word, followed by its variants.
COMMON_FUNCTION_WORDS = {
    "a", "ad", "al", "allo", "ai", "agli", "alla", "alle",
    "con", "col", "coi",
    "da", "dal", "dallo", "dai", "dagli", "dalla", "dalle",
    "di", "del", "dello", "dei", "degli", "della", "delle",
    "e", "ed",
    "in", "nel", "nello", "nei", "negli", "nella", "nelle",
    "o", "od",
    "per",
    "su", "sul", "sullo", "sui", "sugli", "sulla", "sulle",
    "tra", "fra",
}
# Suffix tables consumed by categorize_word() through str.endswith(), which
# requires tuples. All suffixes are accent-free because words are matched
# after normalize_word() has stripped accents.
COMMON_VERB_SUFFIXES = ("are", "ere", "ire")

COMMON_ADVERB_SUFFIXES = ("mente",)

# NOTE(review): "anze" is absent although "enze" and "ezze" are listed;
# confirm whether that omission is intentional.
COMMON_NOUN_SUFFIXES = (
    "zione", "zioni",
    "tore", "tori",
    "trice", "trici",
    "ista", "isti",
    "ismo", "ismi",
    "anza", "enze", "enza",
    "ezza", "ezze",
    "ita",
    "ore", "ori",
)

COMMON_ADJECTIVE_SUFFIXES = (
    "ale", "ali",
    "oso", "osa", "osi", "ose",
    "ivo", "iva", "ivi", "ive",
    "ente", "enti",
)
def maybe_fix_mojibake(text: str) -> str:
    """Undo UTF-8 text that was mistakenly decoded as Latin-1, when possible.

    The repair is attempted only if *text* contains one of the marker
    characters "Ã" or "Â". If the Latin-1 -> UTF-8 round trip fails in either
    direction, *text* is returned unchanged.
    """
    looks_garbled = "Ã" in text or "Â" in text
    if not looks_garbled:
        return text
    try:
        raw_bytes = text.encode("latin-1")
        repaired = raw_bytes.decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Not a clean round trip: keep the input as-is.
        return text
    return repaired
def strip_accents(text: str) -> str:
    """Return *text* in NFKD form with every combining character removed."""
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
def normalize_word(word: str) -> Optional[str]:
    """Reduce a raw dictionary entry to a plain lowercase a-z word.

    Steps: repair mojibake, trim and lowercase, unify apostrophe variants,
    strip accents, then remove apostrophes, hyphens and spaces.

    Args:
        word: Raw entry as read from the word source.

    Returns:
        The normalized word, or ``None`` when the result is shorter than
        ``MIN_WORD_LENGTH`` or contains anything other than the letters a-z.
    """
    # Repair mojibake on the untouched input. Lowercasing first would turn
    # the markers "Ã"/"Â" into "ã"/"â", so maybe_fix_mojibake() would never
    # trigger; stripping first could drop a trailing U+00A0, which is the
    # second half of a garbled "à".
    clean = maybe_fix_mojibake(word).strip().lower()
    # Treat the typographic apostrophe (U+2019) and the backtick like "'".
    # The escape keeps the character visible and unambiguous in the source.
    clean = clean.replace("\u2019", "'").replace("`", "'")
    clean = strip_accents(clean)
    for unwanted in ("'", "-", " "):
        clean = clean.replace(unwanted, "")
    if len(clean) < MIN_WORD_LENGTH:
        return None
    if not re.fullmatch(r"[a-z]+", clean):
        return None
    return clean
def categorize_word(word: str) -> Dict[str, object]:
    """Compute heuristic tags and a quality score for an already-normalized word.

    Bonuses come from function-word membership, suffix matches, the length
    bucket and letter variety; penalties come from a letter repeated three or
    more times in a row, a run of four or more non-vowels, and exceeding
    ``MAX_WORD_LENGTH``.

    Returns:
        ``{"tags": <sorted unique tag names>, "quality": <bonuses - penalties>}``.
    """
    length = len(word)

    # Each entry is a (tag, points) pair; points are summed at the end.
    bonuses: List[tuple] = []
    if word in COMMON_FUNCTION_WORDS:
        bonuses.append(("function", 6))

    suffix_rules = (
        (COMMON_VERB_SUFFIXES, "verb_infinitive", 4),
        (COMMON_ADVERB_SUFFIXES, "adverb", 3),
        (COMMON_NOUN_SUFFIXES, "noun_like", 3),
        (COMMON_ADJECTIVE_SUFFIXES, "adjective_like", 2),
    )
    bonuses.extend(
        (tag, points)
        for suffixes, tag, points in suffix_rules
        if word.endswith(suffixes)
    )

    # Exactly one length bucket applies.
    if length <= 4:
        bonuses.append(("short", 2))
    elif length <= 9:
        bonuses.append(("medium", 3))
    else:
        bonuses.append(("long", 1))

    if len(set(word)) >= max(4, length // 2):
        bonuses.append(("varied_letters", 2))

    penalties: List[tuple] = []
    # Same character three or more times in a row.
    if re.search(r"(.)\1{2,}", word):
        penalties.append(("repetition_penalty", 3))
    # Four or more consecutive characters that are not a, e, i, o or u.
    if re.search(r"[^aeiou]{4,}", word):
        penalties.append(("cluster_penalty", 2))
    if length > MAX_WORD_LENGTH:
        penalties.append(("too_long", 6))

    total_bonus = sum(points for _, points in bonuses)
    total_penalty = sum(points for _, points in penalties)
    labels = sorted({tag for tag, _ in bonuses + penalties})
    return {"tags": labels, "quality": total_bonus - total_penalty}
def is_good_crossword_word(word: str, meta: Dict[str, object]) -> bool:
    """Decide whether *word* belongs in the filtered vocabulary.

    Args:
        word: The normalized word.
        meta: Mapping with a ``"tags"`` iterable and a ``"quality"`` value
            convertible to ``int`` (the shape returned by categorize_word()).

    Returns:
        ``False`` when the word is tagged "too_long", when it has 13+ letters
        with quality below 5 and is neither "function" nor "verb_infinitive",
        or when its quality is below 2; ``True`` otherwise.
    """
    labels = set(meta["tags"])
    quality = int(meta["quality"])

    if "too_long" in labels:
        return False

    # Function words and infinitives are exempt from the stricter
    # threshold applied to very long words.
    exempt = bool(labels & {"function", "verb_infinitive"})
    if len(word) >= 13 and not exempt and quality < 5:
        return False

    return quality >= 2
def extract_words(raw_words: Iterable[str]) -> List[str]:
    """Normalize *raw_words* and return the unique survivors in sorted order.

    Entries for which normalize_word() returns ``None`` are dropped.
    """
    candidates = map(normalize_word, raw_words)
    unique_words = {candidate for candidate in candidates if candidate is not None}
    return sorted(unique_words)
def build_vocabulary(
    source_path: Path = PACKAGE_WORDS_PATH,
    output_path: Path = OUTPUT_PATH,
    *,
    filtered_output_path: Path = FILTERED_OUTPUT_PATH,
    metadata_output_path: Path = METADATA_OUTPUT_PATH,
) -> Dict[str, int]:
    """Build the extended list, the filtered list and the metadata file.

    Args:
        source_path: UTF-8 JSON file whose top-level keys are the raw words
            (only ``.keys()`` of the parsed payload is used).
        output_path: Destination of the full normalized list, one word per
            line.
        filtered_output_path: Destination of the words accepted by
            is_good_crossword_word(), one per line.
        metadata_output_path: Destination of the per-word metadata, written
            as indented JSON with sorted keys.

    Returns:
        Counts under the keys ``"extended_words"``, ``"filtered_words"`` and
        ``"metadata_entries"``.
    """
    payload = json.loads(source_path.read_text(encoding="utf-8"))
    words = extract_words(payload.keys())
    output_path.write_text("\n".join(words) + "\n", encoding="utf-8")

    metadata = {word: categorize_word(word) for word in words}
    filtered_words = [
        word for word in words if is_good_crossword_word(word, metadata[word])
    ]
    # All three destinations are parameters, so a caller that redirects
    # output_path can redirect the companion files as well.
    filtered_output_path.write_text(
        "\n".join(filtered_words) + "\n", encoding="utf-8"
    )
    metadata_output_path.write_text(
        json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    return {
        "extended_words": len(words),
        "filtered_words": len(filtered_words),
        "metadata_entries": len(metadata),
    }
def main() -> None:
    """Run the build with default paths and print a summary to stdout."""
    totals = build_vocabulary()
    report_lines = (
        f"Vocabolario esteso: {OUTPUT_PATH}",
        f"Vocabolario filtrato: {FILTERED_OUTPUT_PATH}",
        f"Metadati: {METADATA_OUTPUT_PATH}",
        f"Parole estese: {totals['extended_words']}",
        f"Parole filtrate: {totals['filtered_words']}",
        f"Metadati generati: {totals['metadata_entries']}",
    )
    for line in report_lines:
        print(line)


# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()