feat: aggiunge CLI unificata, build vocabolario e filtro lessicale
This commit is contained in:
243
build_vocabulary.py
Normal file
243
build_vocabulary.py
Normal file
@@ -0,0 +1,243 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional
|
||||
|
||||
|
||||
# Every default path below is resolved relative to this script's own location.
_THIS_FILE = Path(__file__)

# Default input: the JSON word list under ``package/dist`` next to this script.
PACKAGE_WORDS_PATH = _THIS_FILE.with_name("package") / "dist" / "words.json"

# Default outputs, all written beside this script.
OUTPUT_PATH = _THIS_FILE.with_name("vocaboli_it_esteso.txt")
FILTERED_OUTPUT_PATH = _THIS_FILE.with_name("vocaboli_it_filtrato.txt")
METADATA_OUTPUT_PATH = _THIS_FILE.with_name("vocaboli_it_metadata.json")

# Length bounds (in letters) applied to normalized words.
MIN_WORD_LENGTH = 2
MAX_WORD_LENGTH = 14
|
||||
|
||||
# Italian prepositions (plain and fused with an article) and conjunctions.
# One row per base word, followed by its contracted forms.
COMMON_FUNCTION_WORDS = {
    "a", "ad", "al", "allo", "ai", "agli", "alla", "alle",
    "con", "col", "coi",
    "da", "dal", "dallo", "dai", "dagli", "dalla", "dalle",
    "di", "del", "dello", "dei", "degli", "della", "delle",
    "e", "ed",
    "in", "nel", "nello", "nei", "negli", "nella", "nelle",
    "o", "od",
    "per",
    "su", "sul", "sullo", "sui", "sugli", "sulla", "sulle",
    "tra", "fra",
}
|
||||
|
||||
# Suffix tables used for heuristic part-of-speech tagging. They are tuples so
# they can be passed straight to ``str.endswith``. Words are matched after
# accent stripping, hence "ita" rather than "ità".

# Infinitive endings of the three verb conjugations.
COMMON_VERB_SUFFIXES = ("are", "ere", "ire")

# Adverbs of manner.
COMMON_ADVERB_SUFFIXES = ("mente",)

# Noun endings, listed as singular/plural pairs. "anze" completes the
# "anza" pair that was previously missing its plural (e.g. "speranze").
COMMON_NOUN_SUFFIXES = (
    "zione", "zioni",
    "tore", "tori",
    "trice", "trici",
    "ista", "isti",
    "ismo", "ismi",
    "anza", "anze",
    "enza", "enze",
    "ezza", "ezze",
    "ita",
    "ore", "ori",
)

# Adjective endings, grouped by stem with their gender/number variants.
COMMON_ADJECTIVE_SUFFIXES = (
    "ale", "ali",
    "oso", "osa", "osi", "ose",
    "ivo", "iva", "ivi", "ive",
    "ente", "enti",
)
|
||||
|
||||
|
||||
def maybe_fix_mojibake(text: str) -> str:
    """Repair UTF-8 text that was mistakenly decoded as Latin-1.

    The repair is attempted only when *text* contains one of the marker
    characters "Ã" or "Â". It re-encodes the text as Latin-1 and decodes the
    resulting bytes as UTF-8.

    Args:
        text: The string to inspect.

    Returns:
        The repaired string, or *text* unchanged when no marker is present or
        when the round-trip fails.
    """
    looks_garbled = any(marker in text for marker in ("Ã", "Â"))
    if looks_garbled:
        try:
            repaired = text.encode("latin-1").decode("utf-8")
        except (UnicodeEncodeError, UnicodeDecodeError):
            # Not a clean Latin-1/UTF-8 mix-up: leave the input alone.
            repaired = text
        return repaired
    return text
|
||||
|
||||
|
||||
def strip_accents(text: str) -> str:
    """Return *text* with combining marks removed.

    The text is decomposed with NFKD normalization, then every character
    whose Unicode combining class is non-zero is dropped.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)
|
||||
|
||||
|
||||
def normalize_word(word: str) -> Optional[str]:
    """Reduce a raw dictionary entry to a plain lowercase ``a-z`` token.

    Steps, in order: repair mojibake, trim surrounding whitespace, lowercase,
    unify apostrophe variants, strip accents, then remove apostrophes,
    hyphens and spaces.

    Args:
        word: Raw entry as read from the source word list.

    Returns:
        The normalized word, or ``None`` when the result is shorter than
        ``MIN_WORD_LENGTH`` or contains anything other than the letters a-z.
    """
    # Mojibake must be repaired on the raw text, BEFORE strip() and lower():
    # the detector looks for the uppercase markers "Ã"/"Â", which lower()
    # turns into "ã"/"â" (so the repair was never attempted), and strip()
    # would remove a trailing no-break space that belongs to the garbled
    # sequence.
    clean = maybe_fix_mojibake(word).strip().lower()
    clean = clean.replace("’", "'").replace("`", "'")
    clean = strip_accents(clean)
    # Join compound spellings into a single token ("l'oro" -> "loro").
    for separator in ("'", "-", " "):
        clean = clean.replace(separator, "")

    if len(clean) < MIN_WORD_LENGTH:
        return None
    if not re.fullmatch(r"[a-z]+", clean):
        return None
    return clean
|
||||
|
||||
|
||||
def categorize_word(word: str) -> Dict[str, object]:
    """Tag a normalized word and compute its heuristic quality score.

    Points are awarded for membership in the function-word set, for matching
    one of the suffix tables, for the length class and for letter variety.
    Points are deducted for a letter repeated three or more times in a row,
    for a run of four or more non-vowels, and for exceeding
    ``MAX_WORD_LENGTH``.

    Args:
        word: A normalized word.

    Returns:
        A dict with ``"tags"`` (sorted list of unique tag names) and
        ``"quality"`` (awarded points minus deducted points).
    """
    length = len(word)
    labels: List[str] = []
    bonus = 0
    malus = 0

    if word in COMMON_FUNCTION_WORDS:
        labels.append("function")
        bonus += 6

    # (suffix table, tag, points) -- each rule is checked independently, so a
    # word can collect more than one part-of-speech tag.
    suffix_rules = (
        (COMMON_VERB_SUFFIXES, "verb_infinitive", 4),
        (COMMON_ADVERB_SUFFIXES, "adverb", 3),
        (COMMON_NOUN_SUFFIXES, "noun_like", 3),
        (COMMON_ADJECTIVE_SUFFIXES, "adjective_like", 2),
    )
    for suffixes, label, points in suffix_rules:
        if word.endswith(suffixes):
            labels.append(label)
            bonus += points

    # Exactly one length class applies.
    if length <= 4:
        length_label, length_points = "short", 2
    elif length <= 9:
        length_label, length_points = "medium", 3
    else:
        length_label, length_points = "long", 1
    labels.append(length_label)
    bonus += length_points

    distinct_letters = len(set(word))
    if distinct_letters >= max(4, length // 2):
        labels.append("varied_letters")
        bonus += 2

    # Same character three or more times in a row.
    if re.search(r"(.)\1{2,}", word) is not None:
        labels.append("repetition_penalty")
        malus += 3

    # Four or more consecutive characters that are not a, e, i, o or u.
    if re.search(r"[^aeiou]{4,}", word) is not None:
        labels.append("cluster_penalty")
        malus += 2

    if length > MAX_WORD_LENGTH:
        labels.append("too_long")
        malus += 6

    return {"tags": sorted(set(labels)), "quality": bonus - malus}
|
||||
|
||||
|
||||
def is_good_crossword_word(word: str, meta: Dict[str, object]) -> bool:
    """Decide whether *word* belongs in the filtered vocabulary.

    Args:
        word: The normalized word.
        meta: Mapping with a ``"tags"`` iterable and a ``"quality"`` value
            convertible to ``int``.

    Returns:
        ``False`` when the word is tagged ``too_long``; when it has 13 or
        more letters, is neither ``function`` nor ``verb_infinitive`` and
        scores below 5; or when it scores below 2. ``True`` otherwise.
    """
    labels = set(meta["tags"])
    score = int(meta["quality"])

    if "too_long" in labels:
        return False

    # Very long words must earn their place unless they are function words
    # or infinitives.
    exempt = bool(labels & {"function", "verb_infinitive"})
    if len(word) >= 13 and not exempt and score < 5:
        return False

    return score >= 2
|
||||
|
||||
|
||||
def extract_words(raw_words: Iterable[str]) -> List[str]:
    """Normalize raw entries and return the unique results in sorted order.

    Entries that ``normalize_word`` rejects (returns ``None`` for) are
    skipped. Entries that normalize to the same word are kept once.
    """
    candidates = (normalize_word(raw) for raw in raw_words)
    unique_words = {candidate for candidate in candidates if candidate is not None}
    return sorted(unique_words)
|
||||
|
||||
|
||||
def build_vocabulary(
    source_path: Path = PACKAGE_WORDS_PATH,
    output_path: Path = OUTPUT_PATH,
    filtered_output_path: Optional[Path] = None,
    metadata_output_path: Optional[Path] = None,
) -> Dict[str, int]:
    """Build the extended and filtered vocabularies plus their metadata.

    Reads a JSON object from *source_path*, normalizes its keys, and writes
    three files: the full normalized word list, the subset accepted by
    ``is_good_crossword_word``, and a JSON file with per-word metadata.

    Args:
        source_path: JSON file whose top-level keys are the raw words.
        output_path: Destination of the full normalized word list.
        filtered_output_path: Destination of the filtered word list.
            Defaults to ``FILTERED_OUTPUT_PATH``.
        metadata_output_path: Destination of the metadata JSON.
            Defaults to ``METADATA_OUTPUT_PATH``.

    Returns:
        Counts under the keys ``"extended_words"``, ``"filtered_words"`` and
        ``"metadata_entries"``.
    """
    # Resolve the fallbacks at call time so the module-level constants are
    # read exactly when they were before these parameters existed.
    if filtered_output_path is None:
        filtered_output_path = FILTERED_OUTPUT_PATH
    if metadata_output_path is None:
        metadata_output_path = METADATA_OUTPUT_PATH

    payload = json.loads(source_path.read_text(encoding="utf-8"))
    words = extract_words(payload.keys())
    output_path.write_text("\n".join(words) + "\n", encoding="utf-8")

    metadata = {word: categorize_word(word) for word in words}
    filtered_words = [
        word for word in words if is_good_crossword_word(word, metadata[word])
    ]

    filtered_output_path.write_text("\n".join(filtered_words) + "\n", encoding="utf-8")
    metadata_output_path.write_text(
        json.dumps(metadata, ensure_ascii=False, indent=2, sort_keys=True),
        encoding="utf-8",
    )
    return {
        "extended_words": len(words),
        "filtered_words": len(filtered_words),
        "metadata_entries": len(metadata),
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Run the build with default paths and print a summary to stdout."""
    totals = build_vocabulary()
    report = (
        f"Vocabolario esteso: {OUTPUT_PATH}",
        f"Vocabolario filtrato: {FILTERED_OUTPUT_PATH}",
        f"Metadati: {METADATA_OUTPUT_PATH}",
        f"Parole estese: {totals['extended_words']}",
        f"Parole filtrate: {totals['filtered_words']}",
        f"Metadati generati: {totals['metadata_entries']}",
    )
    for line in report:
        print(line)


# Build the vocabulary only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user