#!/usr/bin/env python3
"""
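Picture-book text extraction script: uses EasyOCR to extract text and its coordinates from page images.
Outputs JSON: for each page, the word list, the sentence text, and bbox coordinates.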
"""
import os
import json
import sys

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from app import config

import easyocr

# Configuration (based on $workhome; can be overridden via environment variables)
BASE_PATH = str(config.IMAGES_DIR)      # picture-book image source (OCR input)
OUTPUT_PATH = str(config.OCR_DIR)       # OCR result output directory
STAGES = ["stage-03", "stage-04"]  # stages currently being studied

def _page_number(page_file):
    """Return the integer page number encoded in a file name such as ``12.png``."""
    return int(os.path.splitext(page_file)[0])


def extract_book(reader, stage, book_folder):
    """Extract the text of every page of one picture book.

    Args:
        reader: OCR reader exposing ``readtext(path)``; every result it
            returns is unpacked as ``(bbox, text, confidence)``.
        stage: Name of the stage directory under ``BASE_PATH``.
        book_folder: Name of the book directory inside the stage directory.

    Returns:
        A dict with the keys ``stage``, ``book_folder``, ``page_count`` and
        ``pages``. ``pages`` is ordered by numeric page number and each entry
        holds ``page``, ``file``, ``full_text`` and ``words``; every word has
        ``text``, ``bbox`` (``[x, y, w, h]``) and ``confidence``.

    Raises:
        ValueError: If a ``.png`` file name is not a plain integer.
    """
    book_path = os.path.join(BASE_PATH, stage, book_folder)
    # Sort by the numeric page number: a plain string sort would put
    # "10.png" before "2.png" and scramble the page order. The file name is
    # a tie-breaker so the order stays deterministic (e.g. "1.png"/"01.png").
    pages = sorted(
        (f for f in os.listdir(book_path) if f.endswith('.png')),
        key=lambda f: (_page_number(f), f),
    )

    book_data = {
        "stage": stage,
        "book_folder": book_folder,
        "page_count": len(pages),
        "pages": []
    }

    for page_file in pages:
        page_path = os.path.join(book_path, page_file)

        # Run OCR on the page image.
        results = reader.readtext(page_path)

        words = []
        texts = []
        for (bbox, text, confidence) in results:
            if confidence < 0.3:  # drop low-confidence detections
                continue
            # bbox is four corner points [[x1,y1],[x2,y2],[x3,y3],[x4,y4]];
            # convert it to the axis-aligned [x, y, w, h] form.
            x_coords = [p[0] for p in bbox]
            y_coords = [p[1] for p in bbox]
            x = min(x_coords)
            y = min(y_coords)
            w = max(x_coords) - x
            h = max(y_coords) - y

            # Coerce to built-in int/float. NOTE(review): the OCR backend may
            # hand back NumPy scalar types, which json cannot serialize --
            # confirm against the installed EasyOCR version.
            words.append({
                "text": text,
                "bbox": [int(round(float(v))) for v in (x, y, w, h)],
                "confidence": round(float(confidence), 3)
            })
            texts.append(text)

        book_data["pages"].append({
            "page": _page_number(page_file),
            "file": page_file,
            "full_text": " ".join(texts).strip(),
            "words": words
        })

    return book_data

def _write_json_atomic(path, data):
    """Write ``data`` as JSON to ``path`` via a temporary file and a rename."""
    tmp_path = path + ".tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; this only fires
        # when writing failed part-way.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def main():
    """Run OCR over every book of every configured stage and save the results.

    For each stage in ``STAGES`` the book directories under ``BASE_PATH`` are
    processed in sorted order and one ``<book_folder>.json`` file per book is
    written below ``OUTPUT_PATH/<stage>``. Books whose output file already
    exists are skipped; an error in one book is printed and the run continues
    with the next book.
    """
    # Initialise EasyOCR (English model, CPU only).
    print("正在加载 EasyOCR 模型...")
    reader = easyocr.Reader(['en'], gpu=False)
    print("模型加载完成")

    os.makedirs(OUTPUT_PATH, exist_ok=True)

    for stage in STAGES:
        stage_path = os.path.join(BASE_PATH, stage)
        if not os.path.isdir(stage_path):
            print(f"跳过 {stage}：路径不存在")
            continue

        # Keep only directories so the reported total and the progress
        # indices are not inflated by stray files in the stage directory.
        books = sorted(
            name for name in os.listdir(stage_path)
            if os.path.isdir(os.path.join(stage_path, name))
        )
        total = len(books)
        print(f"\n处理 {stage}：共 {total} 本绘本")

        for index, book_folder in enumerate(books, start=1):
            # Skip books that already have an output file.
            output_file = os.path.join(OUTPUT_PATH, stage, f"{book_folder}.json")
            if os.path.exists(output_file):
                print(f"  [{index}/{total}] {book_folder} - 已存在，跳过")
                continue

            print(f"  [{index}/{total}] {book_folder} ...", end=" ", flush=True)

            # Top-level boundary: report the failure and move on so one bad
            # book does not abort the whole batch.
            try:
                book_data = extract_book(reader, stage, book_folder)

                # Save atomically: a partially written file would otherwise
                # be mistaken for a finished book on the next run.
                os.makedirs(os.path.dirname(output_file), exist_ok=True)
                _write_json_atomic(output_file, book_data)

                total_words = sum(len(p["words"]) for p in book_data["pages"])
                print(f"完成 ({book_data['page_count']}页, {total_words}个文本块)")
            except Exception as e:
                print(f"错误: {e}")

# Run the extraction only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
