"""
CMU词典 + 字母-音素对齐，批量生成缺失单词的拼读数据。
用法: python generate_phonics.py
输出: 更新 phonics-data.json，然后重跑 process_book_data.py
"""
import json
import os
import sys
import re

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from app import config

from nltk.corpus import cmudict

# Paths taken from app.config. Per the original note they are rooted at
# $workhome and can be overridden through environment variables; that logic
# lives in the config module, not in this file.
PHONICS_DATA = str(config.PHONICS_FILE)
STRUCTURED_PATH = str(config.STRUCTURED_DIR)

# ARPABET symbol (stress digit removed) -> IPA transcription.
ARPABET_TO_IPA = {
    'AA': 'ɑː', 'AE': 'æ', 'AH': 'ʌ', 'AO': 'ɔː', 'AW': 'aʊ',
    'AY': 'aɪ', 'B': 'b', 'CH': 'tʃ', 'D': 'd', 'DH': 'ð',
    'EH': 'e', 'ER': 'ɜːr', 'EY': 'eɪ', 'F': 'f', 'G': 'g',
    'HH': 'h', 'IH': 'ɪ', 'IY': 'iː', 'JH': 'dʒ', 'K': 'k',
    'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'OW': 'əʊ',
    'OY': 'ɔɪ', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'ʃ',
    'T': 't', 'TH': 'θ', 'UH': 'ʊ', 'UW': 'uː', 'V': 'v',
    'W': 'w', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ'
}


def arpabet_to_ipa(phone):
    """Convert a single ARPABET phone (e.g. ``'AH0'``, ``'SH'``) to IPA.

    Args:
        phone: ARPABET symbol, optionally followed by stress digits
            ``0``/``1``/``2``.

    Returns:
        The IPA string from ``ARPABET_TO_IPA``. An ``AH`` whose last
        character is ``0`` (unstressed) is returned as schwa ``'ə'`` instead
        of the table's ``'ʌ'``. A symbol missing from the table is returned
        as-is with its trailing stress digits removed; an empty string
        therefore yields an empty string rather than raising.
    """
    base = phone.rstrip('012')
    # endswith() is safe on an empty string, unlike indexing phone[-1].
    if base == 'AH' and phone.endswith('0'):
        return 'ə'
    return ARPABET_TO_IPA.get(base, base)


# Multi-letter spellings that stand for ONE phoneme. Each value lists the
# ARPABET bases (stress digits removed) the spelling is allowed to match.
_MULTI_LETTER_GRAPHEMES = {
    # Consonant trigraphs
    'tch': ('CH',),
    'dge': ('JH',),
    # Consonant digraphs
    'sh': ('SH',),
    'ch': ('CH', 'K'),  # ch can be /tʃ/ or /k/ (chorus)
    'th': ('TH', 'DH'),
    'ph': ('F',),
    'wh': ('W', 'HH'),
    'wr': ('R',),
    'kn': ('N',),
    'gn': ('N',),
    'ng': ('NG',),
    'ck': ('K',),
    'gh': ('G', 'F'),  # ghost vs enough
    'dg': ('JH',),
    # Doubled consonants pronounced as a single sound
    'bb': ('B',), 'cc': ('K',), 'dd': ('D',), 'ff': ('F',), 'gg': ('G',),
    'll': ('L',), 'mm': ('M',), 'nn': ('N',), 'pp': ('P',), 'rr': ('R',),
    'ss': ('S',), 'tt': ('T',), 'zz': ('Z',),
    # Vowel digraphs and r-controlled vowels
    'ea': ('IY', 'EH', 'EY'),
    'ee': ('IY',),
    'oo': ('UW', 'UH'),
    'ai': ('EY',),
    'ay': ('EY',),
    'oa': ('OW',),
    'ou': ('AW', 'AO', 'UW', 'AH'),
    'ow': ('AW', 'OW'),
    'oi': ('OY',),
    'oy': ('OY',),
    'au': ('AO',),
    'aw': ('AO',),
    'ew': ('UW', 'Y'),
    'ie': ('IY', 'AY'),
    'ei': ('EY', 'IY'),
    'ue': ('UW',),
    'ir': ('ER',),
    'ur': ('ER',),
    'er': ('ER',),
    'ar': ('AA', 'ER'),
    'or': ('AO', 'ER'),
}

# Phoneme pairs that the single letter 'x' spells: box /ks/, exam /gz/.
_X_CLUSTERS = frozenset({('K', 'S'), ('G', 'Z')})


def _match_grapheme(letters, li, phone_base):
    """Return the longest known multi-letter grapheme at ``li`` that may spell ``phone_base``, else None."""
    for size in (3, 2):
        chunk = letters[li:li + size]
        if len(chunk) == size and phone_base in _MULTI_LETTER_GRAPHEMES.get(chunk, ()):
            return chunk
    return None


def align_letters_phonemes(word, arpabet_phones):
    """Greedily align the letters of ``word`` with its ARPABET pronunciation.

    Walking left to right, each phoneme is paired with, in order of
    preference: a known trigraph/digraph/doubled consonant/vowel digraph that
    may spell it, the letter ``x`` when the current and next phonemes form
    /ks/ or /gz/, or otherwise the single current letter.

    Nothing is dropped: letters left over once the phonemes run out (e.g. a
    silent final ``e``) are appended to the last pair's ``letter``; phonemes
    left over once the letters run out are appended to the last pair's
    ``phoneme``.

    Args:
        word: The spelling; it is lower-cased before alignment.
        arpabet_phones: Sequence of ARPABET phones, with or without stress
            digits (e.g. ``['B', 'AA1', 'K', 'S']``).

    Returns:
        A tuple ``(pairs, ipa_phones)``. ``pairs`` is a list of
        ``{'letter': str, 'phoneme': str}`` dicts in word order and
        ``ipa_phones`` is the IPA rendering of every input phone. When there
        are letters but no phonemes, ``pairs`` holds one entry containing the
        whole word with an empty phoneme.
    """
    ipa_phones = [arpabet_to_ipa(p) for p in arpabet_phones]
    phone_bases = [p.rstrip('012') for p in arpabet_phones]
    letters = word.lower()
    n_letters = len(letters)
    n_phones = len(ipa_phones)

    result = []
    li = 0
    pi = 0

    while li < n_letters and pi < n_phones:
        phone_base = phone_bases[pi]

        grapheme = _match_grapheme(letters, li, phone_base)
        if grapheme is not None:
            result.append({'letter': grapheme, 'phoneme': ipa_phones[pi]})
            li += len(grapheme)
            pi += 1
            continue

        # 'x' is the one letter that routinely spells two phonemes. This must
        # not depend on how many letters remain: such words typically have
        # fewer letters than phonemes.
        if (letters[li] == 'x' and pi + 1 < n_phones
                and (phone_base, phone_bases[pi + 1]) in _X_CLUSTERS):
            result.append({'letter': 'x', 'phoneme': ipa_phones[pi] + ipa_phones[pi + 1]})
            li += 1
            pi += 2
            continue

        # Default: single letter -> single phoneme
        result.append({'letter': letters[li], 'phoneme': ipa_phones[pi]})
        li += 1
        pi += 1

    if li < n_letters:
        # Remaining letters (silent e, etc.) belong to the last grapheme.
        leftover = letters[li:]
        if result:
            result[-1]['letter'] += leftover
        else:
            result.append({'letter': leftover, 'phoneme': ''})
    elif pi < n_phones and result:
        # Remaining phonemes are attached to the last grapheme so the pairs
        # still spell out the complete pronunciation.
        result[-1]['phoneme'] += ''.join(ipa_phones[pi:])

    return result, ipa_phones


def get_missing_words(structured_path=None, stages=('stage-03', 'stage-04')):
    """Collect the words that the structured book data lists as lacking phonics.

    Every ``*.json`` file directly inside each stage directory is read and
    its ``words_without_phonics`` list (if present) is scanned.

    Args:
        structured_path: Root directory that contains the stage folders.
            Defaults to the module-level ``STRUCTURED_PATH``.
        stages: Names of the stage sub-directories to scan. Stage
            directories that do not exist are skipped.

    Returns:
        A sorted list of unique words. Only strings made up entirely of
        lowercase ASCII letters and at least two characters long are kept.

    Raises:
        json.JSONDecodeError: If one of the ``.json`` files is not valid JSON.
    """
    if structured_path is None:
        structured_path = STRUCTURED_PATH

    missing = set()
    for stage in stages:
        stage_dir = os.path.join(structured_path, stage)
        if not os.path.isdir(stage_dir):
            continue
        for filename in os.listdir(stage_dir):
            if not filename.endswith('.json'):
                continue
            # Explicit UTF-8 so the result does not depend on the platform's
            # default encoding (the other file reads in this script do the same).
            with open(os.path.join(stage_dir, filename), encoding='utf-8') as fh:
                data = json.load(fh)
            for w in data.get('words_without_phonics', []):
                # Only clean alphabetic words. fullmatch (unlike match with
                # '$') also rejects a trailing newline.
                if isinstance(w, str) and len(w) >= 2 and re.fullmatch(r'[a-z]+', w):
                    missing.add(w)
    return sorted(missing)


def main():
    """Generate phonics entries for missing words and write them back to disk.

    Steps:
      1. Load the phonics JSON at ``PHONICS_DATA`` and take its ``words`` map.
      2. Collect candidate words via ``get_missing_words()``.
      3. For each candidate not already present, look it up in the CMU
         dictionary, align letters to phonemes using the first listed
         pronunciation, and add an entry.
      4. Rewrite ``PHONICS_DATA`` with the extended map.
      5. If some words were not found in the CMU dictionary, save them to
         ``words-not-in-cmu.json`` under ``config.DATA_DIR``.

    Progress and summary counts are printed to stdout.
    """
    # Load existing phonics data
    with open(PHONICS_DATA, 'r', encoding='utf-8') as f:
        phonics_data = json.load(f)

    existing_words = phonics_data.get('words', {})
    print(f"Existing phonics words: {len(existing_words)}")

    # Get missing words
    missing = get_missing_words()
    print(f"Missing words in book data: {len(missing)}")

    # Load CMU dict
    cmu = cmudict.dict()
    print(f"CMU dict size: {len(cmu)}")

    # Generate phonics for missing words found in CMU dict
    generated = 0
    not_in_cmu = []

    for word in missing:
        if word in existing_words:
            continue

        pronunciations = cmu.get(word)
        if not pronunciations:
            # Absent (or listed without any pronunciation): leave for later.
            not_in_cmu.append(word)
            continue

        # Take first pronunciation
        aligned, ipa_phones = align_letters_phonemes(word, pronunciations[0])

        # Build entry matching existing format
        entry = {
            'phonemes': ipa_phones,
            'letterPhonemes': aligned,
            'highlight': '',
            'exampleWord': '',
            'exampleMeaning': ''
        }

        # Highlight: first aligned grapheme that passes the vowel test and has
        # a non-empty phoneme. Stress is not consulted.
        # NOTE(review): `in 'aeiou'` is a substring test, so besides single
        # vowels it also accepts graphemes such as 'ou' and 'ei' (contiguous
        # in 'aeiou') but not 'ea' or 'oo' — confirm this is intended.
        for pair in aligned:
            if pair['letter'] in 'aeiou' and pair['phoneme']:
                entry['highlight'] = pair['letter']
                break

        existing_words[word] = entry
        generated += 1

    # Save updated phonics data
    phonics_data['words'] = existing_words
    with open(PHONICS_DATA, 'w', encoding='utf-8') as f:
        json.dump(phonics_data, f, ensure_ascii=False, indent=2)

    print(f"\nGenerated: {generated} words")
    print(f"Not in CMU dict: {len(not_in_cmu)} words")
    print(f"Total phonics words now: {len(existing_words)}")

    if not_in_cmu:
        # Save list for AI generation later
        not_in_cmu_path = str(config.DATA_DIR / "words-not-in-cmu.json")
        with open(not_in_cmu_path, 'w', encoding='utf-8') as f:
            json.dump(not_in_cmu, f, ensure_ascii=False, indent=2)
        print(f"Saved {len(not_in_cmu)} words to words-not-in-cmu.json (need AI generation)")
        print(f"Sample: {not_in_cmu[:20]}")


# Run the generator only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
