#!/usr/bin/env python3
"""
数据整理脚本：把 OCR 结果处理为结构化绘本课程数据
- 过滤封面/扉页/版权页，只保留正文内容页
- 拆分句子，提取单词
- 关联已有 phonics-data.json 的拼读数据
- 输出标准化 JSON 供前端使用
"""
import os
import sys
import json
import re

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from app import config

# Filesystem locations taken from the project config, converted to plain
# strings so they can be used with os.path helpers and open().
OCR_PATH = str(config.OCR_DIR)  # root of the OCR JSON input, one sub-folder per stage
PHONICS_PATH = str(config.PHONICS_FILE)  # JSON file read by load_phonics_data()
OUTPUT_PATH = str(config.STRUCTURED_DIR)  # root of the structured JSON output
# Stage sub-folder names handled by main(); a stage whose OCR folder does not
# exist is skipped.
STAGES = ["stage-03", "stage-04"]

# Regex signatures of non-story pages that must be filtered out.
# is_content_page() runs each pattern with re.search against the lower-cased
# page text, so every pattern is written in lower case.
SKIP_PATTERNS = [
    r"oxford reading tree",
    r"roderick hunt",
    r"alex brychta",
    r"story written by",
    r"talk together",
    r"word recognition",
    r"language comprehension",
    r"read the story",
    r"page \d+.*ask",
    r"sounding out",
    r"blending them",
]

def load_phonics_data(path=None):
    """Load the phonics JSON file and build a word -> breakdown lookup.

    Args:
        path: Path of the phonics JSON file. When omitted (the default),
            the module-level ``PHONICS_PATH`` is used, which keeps existing
            zero-argument callers working.

    Returns:
        A dict mapping each lower-cased word to a list of
        ``{"letters": ..., "phoneme": ...}`` dicts, one per entry of that
        word's ``"letterPhonemes"`` list. Words without a
        ``"letterPhonemes"`` key are left out. The dict is empty when the
        file has no top-level ``"words"`` key.
    """
    if path is None:
        path = PHONICS_PATH
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    word_phonics = {}
    if "words" not in data:
        return word_phonics

    for word, info in data["words"].items():
        if "letterPhonemes" not in info:
            continue
        # The source file uses the key "letter"; the output uses "letters".
        word_phonics[word.lower()] = [
            {
                "letters": lp.get("letter", ""),
                "phoneme": lp.get("phoneme", ""),
            }
            for lp in info["letterPhonemes"]
        ]
    return word_phonics

def is_content_page(page_data):
    """Decide whether an OCR page record is a story-content page.

    Args:
        page_data: Dict for one OCR page; only its ``"full_text"`` entry
            is inspected.

    Returns:
        ``False`` when the text is missing, null, shorter than 5
        characters, matches any entry of ``SKIP_PATTERNS``, or consists
        only of digits (a bare page number); ``True`` otherwise.
    """
    # "full_text" may be absent or explicitly null in the OCR JSON; treat
    # both as empty text instead of failing on None.lower().
    text = (page_data.get("full_text") or "").lower()
    if len(text) < 5:
        return False
    if any(re.search(pattern, text) for pattern in SKIP_PATTERNS):
        return False
    # A page made up solely of digits is just a page number.
    return re.fullmatch(r'\d+', text.strip()) is None

def extract_sentences(full_text):
    """Split a page's text into sentences, keeping the closing punctuation.

    The text is cut at every run of ``.``, ``!``, ``?`` or ``:``; each run
    is re-attached to the chunk that precedes it. Chunks are stripped of
    surrounding whitespace, and any result of two characters or fewer is
    discarded.

    Args:
        full_text: Raw text of one page.

    Returns:
        List of sentence strings in their original order.
    """
    # Because the pattern has a capturing group, re.split returns text and
    # punctuation runs alternately: [text, punct, text, punct, ..., text].
    pieces = re.split(r'([.!?:]+)', full_text.strip())
    bodies = pieces[0::2]
    # There is always one more text chunk than punctuation runs, so pad the
    # punctuation list to pair the trailing chunk with an empty terminator.
    terminators = pieces[1::2] + [""]
    candidates = (body.strip() + mark for body, mark in zip(bodies, terminators))
    return [sentence for sentence in candidates if len(sentence) > 2]

def extract_words_from_sentence(sentence):
    """Return the lower-cased words of a sentence, without punctuation.

    A word is a run of ASCII letters that may contain apostrophes between
    letters (``don't``, ``kipper's``). Apostrophes at the start or end of
    a word are treated as quotation marks and dropped, so a quoted
    ``'Oh no!'`` yields ``oh`` and ``no`` and a lone ``'`` yields nothing.

    Args:
        sentence: Sentence text.

    Returns:
        List of lower-cased words in order of appearance.
    """
    # Apostrophes are only accepted when they sit between two letter runs.
    words = re.findall(r"[a-zA-Z]+(?:'[a-zA-Z]+)*", sentence)
    return [w.lower() for w in words]

def process_book(ocr_file, word_phonics):
    """Convert one book's OCR JSON file into a structured book record.

    Pages rejected by ``is_content_page`` and pages that yield no sentences
    are skipped. Every remaining page is split into sentences and words,
    and each word found in ``word_phonics`` gets its breakdown attached.

    Args:
        ocr_file: Path of the book's OCR JSON file.
        word_phonics: Mapping of lower-cased word -> phonics breakdown.

    Returns:
        Dict with the keys ``stage``, ``book_folder``, ``total_pages``,
        ``content_pages``, ``all_words`` (sorted list),
        ``word_locations`` (word -> list of ``{"page", "sentence"}``),
        ``words_with_phonics`` and ``words_without_phonics``.
    """
    with open(ocr_file, 'r', encoding='utf-8') as fh:
        raw = json.load(fh)

    book = {
        "stage": raw["stage"],
        "book_folder": raw["book_folder"],
        "total_pages": raw["page_count"],
        "content_pages": [],
        "all_words": set(),
        "word_locations": {},  # word -> [{page, sentence}]
    }
    # Short aliases for the containers filled in below.
    kept_pages = book["content_pages"]
    seen_words = book["all_words"]
    locations = book["word_locations"]

    for page in raw["pages"]:
        if not is_content_page(page):
            continue
        page_sentences = extract_sentences(page["full_text"])
        if not page_sentences:
            continue

        page_no = page["page"]
        entry = {
            "page": page_no,
            "file": page["file"],
            "sentences": [],
            "ocr_words": page.get("words", []),
        }

        for sentence in page_sentences:
            details = []
            for token in extract_words_from_sentence(sentence):
                if token in word_phonics:
                    details.append({"word": token, "phonics": word_phonics[token]})
                else:
                    details.append({"word": token})
                seen_words.add(token)
                # Record every occurrence, including repeats in one sentence.
                locations.setdefault(token, []).append({
                    "page": page_no,
                    "sentence": sentence,
                })
            entry["sentences"].append({"text": sentence, "words": details})

        kept_pages.append(entry)

    # Replace the working set with a sorted list so the record is JSON-friendly.
    vocabulary = sorted(seen_words)
    book["all_words"] = vocabulary
    # Partition the vocabulary by availability of phonics data.
    book["words_with_phonics"] = [word for word in vocabulary if word in word_phonics]
    book["words_without_phonics"] = [word for word in vocabulary if word not in word_phonics]

    return book

def main():
    """Process every OCR file of each configured stage and write the results.

    Loads the phonics lookup once, then for each stage in ``STAGES`` whose
    OCR folder exists, converts every ``.json`` file with ``process_book``
    and writes the result under the same file name in the stage's output
    folder. Progress and per-stage totals are printed to stdout.
    """
    print("加载拼读数据...")
    word_phonics = load_phonics_data()
    print(f"已加载 {len(word_phonics)} 个单词的拼读数据")

    os.makedirs(OUTPUT_PATH, exist_ok=True)

    for stage in STAGES:
        source_dir = os.path.join(OCR_PATH, stage)
        if not os.path.exists(source_dir):
            continue

        ocr_files = sorted(name for name in os.listdir(source_dir) if name.endswith('.json'))
        total = len(ocr_files)
        print(f"\n处理 {stage}：共 {total} 本")

        target_dir = os.path.join(OUTPUT_PATH, stage)
        os.makedirs(target_dir, exist_ok=True)

        # Running totals for the stage summary line.
        book_count = 0
        page_count = 0
        sentence_count = 0
        stage_vocabulary = set()

        for position, filename in enumerate(ocr_files, start=1):
            book = process_book(os.path.join(source_dir, filename), word_phonics)
            pages = book["content_pages"]
            vocabulary = book["all_words"]
            with_phonics = book["words_with_phonics"]

            # Only the fields below are serialized; word_locations is not
            # part of the output file.
            book_output = {
                "stage": book["stage"],
                "book_folder": book["book_folder"],
                "total_pages": book["total_pages"],
                "content_page_count": len(pages),
                "unique_word_count": len(vocabulary),
                "words_with_phonics_count": len(with_phonics),
                "content_pages": pages,
                "all_words": vocabulary,
                "words_with_phonics": with_phonics,
                "words_without_phonics": book["words_without_phonics"],
            }

            # The output keeps the input file name.
            with open(os.path.join(target_dir, filename), 'w', encoding='utf-8') as out:
                json.dump(book_output, out, ensure_ascii=False, indent=2)

            book_count += 1
            page_count += len(pages)
            for page in pages:
                sentence_count += len(page["sentences"])
            stage_vocabulary.update(vocabulary)

            print(f"  [{position}/{total}] {book['book_folder']}: "
                  f"{len(pages)}页正文, "
                  f"{len(vocabulary)}词, "
                  f"{len(with_phonics)}有拼读")

        print(f"\n  {stage} 汇总: {book_count}本, "
              f"{page_count}页正文, "
              f"{sentence_count}句, "
              f"{len(stage_vocabulary)}个不重复单词")

if __name__ == "__main__":
    main()
