#!/usr/bin/env python3
"""
音素发音自动提取工具（替代 video-cut 的手工标记）
=====================================================

背景
----
录音/视频内容结构：开头一段"测试朗读"，之后是若干「音标(IPA) + 样例词」朗读单元。
旧方案 251228-words/video-cut 是人工在波形上逐个标记开始/结束再用 ffmpeg 截取，
本工具用"静音切分 + （可选）语音识别"自动完成分段、跳过开头、映射音素并剪辑落盘。

输出约定（与现有拼读系统对齐，见 phoneme.json / config.PHONEMES_AUDIO_DIR）
------------------------------------------------------------------------
- 目标目录：config.PHONEMES_AUDIO_DIR（默认 251228-words/audio/pho-v1，Nginx /audio/phonemes/）
- 文件命名：依据 config.PHONEME_MAP_FILE（phoneme.json，IPA -> "NN.mp3"）。
  新音素自动追加编号并回写 phoneme.json。

处理流水线
----------
1) ffmpeg silencedetect 检测静音 → 反推"说话片段"(start,end)
2) 跳过开头测试朗读：--skip-intro-sec 秒 或 --skip-segments N 段
3) 片段 → 音素映射：
   - 默认"顺序映射"：剩余片段按 --order-file（或内置教学顺序）依次对应音素
   - --unit-utterances 2：每个音素单元含 2 段（音标 + 样例词），成对归组
4) （可选）--asr 用 faster-whisper 转写每段文本，写入 review 供人工核对/排查
5) 先产出 review.json（dry-run，默认）；确认无误后加 --apply 真正剪辑落盘
6) 按 phoneme.json 命名输出 mp3，并回写 phoneme.json、metadata.json

依赖
----
- ffmpeg / ffprobe（必需，需在 PATH 中或用 --ffmpeg/--ffprobe 指定）
- faster-whisper（可选，仅 --asr 时）：pip install faster-whisper

用法示例
--------
  # 1) 先 dry-run 看分段与映射是否正确
  python tools/extract_phoneme_audio.py --input 发音.mp3 --skip-intro-sec 8
  # 2) 调参（静音阈值/最短静音）直到片段数与音素数匹配
  python tools/extract_phoneme_audio.py --input 发音.mp3 --noise -32dB --min-silence 0.45
  # 3) 确认后落盘
  python tools/extract_phoneme_audio.py --input 发音.mp3 --skip-intro-sec 8 --apply
"""
from __future__ import annotations

import argparse
import json
import os
import re
import subprocess
import sys
from dataclasses import dataclass, asdict, field
from pathlib import Path
from typing import Optional

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from app import config

# Default phoneme order of the recording (teaching order, kept consistent with
# video-cut's DEFAULT_PHONEMES). Used when no --order-file is supplied; detected
# segments are mapped onto these IPA symbols positionally.
DEFAULT_PHONEME_ORDER = [
    "iː", "ɪ", "e", "æ", "ɑː", "ɒ", "ɔː", "uː", "ʊ", "ʌ", "ə", "ɜː",
    "eɪ", "aɪ", "ɔɪ", "əʊ", "aʊ", "ɪə", "eə", "i",
    "p", "b", "t", "d", "k", "g", "f", "v", "θ", "ð", "s", "z", "ʃ", "dʒ", "tʃ",
    "h", "m", "n", "ŋ", "l", "r", "w", "j", "ks", "kw",
]


@dataclass
class Segment:
    """A time span of the audio, delimited by ``start`` and ``end`` in seconds.

    In this file instances are produced from the gaps between detected
    silences, i.e. they represent stretches of speech.
    """

    start: float
    end: float

    @property
    def duration(self) -> float:
        """Length of the span in seconds, rounded to three decimals."""
        length = self.end - self.start
        return round(length, 3)


@dataclass
class Unit:
    """Mapping result for one phoneme unit.

    Attributes:
        index: Sequence number of the unit, starting at 1.
        ipa: IPA symbol of the phoneme.
        file: Output file name (``NN.mp3``).
        start: Clip start, in seconds.
        end: Clip end, in seconds.
        transcript: Optional ASR transcript, kept for manual verification.
        note: Free-form note; empty by default.
    """

    index: int
    ipa: str
    file: str
    start: float
    end: float
    transcript: str = ""
    note: str = ""


# ---------------------------------------------------------------------------
# ffmpeg / ffprobe 封装
# ---------------------------------------------------------------------------

def run(cmd: list[str]) -> subprocess.CompletedProcess:
    """Run an external command and capture its stdout/stderr as text.

    Output is decoded as UTF-8 with undecodable bytes replaced, instead of
    using the locale codec in strict mode: tool output may contain file names
    or metadata that are not valid in the locale encoding, and a decode error
    there would abort the whole run even though only ASCII numbers are parsed.

    Args:
        cmd: Argument vector; ``cmd[0]`` is the executable.

    Returns:
        The ``CompletedProcess`` with ``stdout``/``stderr`` as ``str``. The
        return code is not checked here; callers inspect it themselves.

    Raises:
        SystemExit: If the executable cannot be found.
    """
    try:
        return subprocess.run(cmd, capture_output=True, text=True,
                              encoding="utf-8", errors="replace")
    except FileNotFoundError as exc:
        # Turn the raw traceback into an actionable CLI message.
        raise SystemExit(
            f"找不到可执行文件：{cmd[0]}（请确认已安装并在 PATH 中，或用 --ffmpeg/--ffprobe 指定路径）"
        ) from exc


def probe_duration(ffprobe: str, audio: Path) -> float:
    """Return the container duration of *audio* in seconds, as reported by ffprobe.

    Args:
        ffprobe: Path or name of the ffprobe executable.
        audio: Media file to inspect.

    Raises:
        SystemExit: If ffprobe's output cannot be parsed as a float; the
            message includes ffprobe's stderr.
    """
    command = [
        ffprobe, "-v", "error", "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1", str(audio),
    ]
    result = run(command)
    raw_value = result.stdout.strip()
    try:
        seconds = float(raw_value)
    except ValueError:
        raise SystemExit(f"无法解析音频时长，请检查 ffprobe 与文件：{audio}\n{result.stderr}")
    return seconds


def detect_silence(ffmpeg: str, audio: Path, noise: str, min_silence: float) -> list[tuple[float, float]]:
    """Detect silent intervals in *audio* with ffmpeg's ``silencedetect`` filter.

    Args:
        ffmpeg: Path or name of the ffmpeg executable.
        audio: Media file to analyse.
        noise: Silence threshold, passed through as ``noise=`` (e.g. ``"-30dB"``).
        min_silence: Minimum silence length in seconds, passed through as ``d=``.

    Returns:
        ``[(silence_start, silence_end), ...]`` in seconds, in the order ffmpeg
        reported them. Starts are clamped to ``0.0``. A silence that is still
        open when the log ends (no matching ``silence_end``) is returned with
        ``float("inf")`` as its end, meaning "silent until end of file".

    Raises:
        SystemExit: If ffmpeg exits with a non-zero status.
    """
    r = run([
        ffmpeg, "-i", str(audio),
        "-af", f"silencedetect=noise={noise}:d={min_silence}",
        "-f", "null", "-",
    ])
    if r.returncode != 0:
        # Without this, a failed run looks like "no silence" and the whole file
        # would be treated as a single speech segment.
        raise SystemExit(f"ffmpeg 静音检测失败，请检查 ffmpeg 与文件：{audio}\n{r.stderr}")

    # ffmpeg may print a slightly negative first silence_start and may use
    # exponent notation for tiny values, so accept a sign and an exponent.
    number = r"(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)"
    event_re = re.compile(rf"silence_(start|end):\s*{number}")

    silences: list[tuple[float, float]] = []
    open_start: Optional[float] = None
    # Walk the events in log order so each end is paired with the start that
    # precedes it, rather than pairing two independently collected lists by index.
    for kind, value in event_re.findall(r.stderr):
        t = float(value)
        if kind == "start":
            if open_start is None:
                open_start = max(0.0, t)
        elif open_start is not None:
            silences.append((open_start, t))
            open_start = None
    if open_start is not None:
        silences.append((open_start, float("inf")))
    return silences


def silences_to_segments(silences: list[tuple[float, float]], duration: float,
                         min_seg: float) -> list[Segment]:
    """Derive speech segments as the complement of the silence intervals.

    Args:
        silences: ``(silence_start, silence_end)`` pairs in seconds, in order.
        duration: Total length of the audio in seconds.
        min_seg: Gaps shorter than this many seconds are discarded.

    Returns:
        The non-silent spans, with bounds rounded to three decimals.
    """
    speech: list[Segment] = []
    prev_end = 0.0
    # A zero-length sentinel at `duration` lets the tail of the file go through
    # the same gap test as every gap between two silences.
    for gap_start, gap_end in [*silences, (duration, duration)]:
        if gap_start - prev_end >= min_seg:
            speech.append(Segment(round(prev_end, 3), round(gap_start, 3)))
        prev_end = gap_end
    return speech


# ---------------------------------------------------------------------------
# 可选：ASR 转写（faster-whisper），仅用于人工核对/排查
# ---------------------------------------------------------------------------

def transcribe(audio: Path, segs: list[Segment], model_size: str) -> list[str]:
    """Transcribe each segment with faster-whisper, for manual review only.

    Args:
        audio: Media file the segments refer to.
        segs: Segments to transcribe; each needs ``start`` and ``end`` seconds.
        model_size: Whisper model name passed to ``WhisperModel``.

    Returns:
        One transcript string per segment, in the same order. If
        faster-whisper is not installed, a warning is printed to stderr and
        every transcript is the empty string.
    """
    if not segs:
        # Nothing to transcribe: skip importing and loading the model entirely.
        return []
    try:
        from faster_whisper import WhisperModel  # type: ignore
    except ImportError:
        print("[警告] 未安装 faster-whisper，跳过 ASR。pip install faster-whisper", file=sys.stderr)
        return ["" for _ in segs]
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    texts: list[str] = []
    for seg in segs:
        # One transcribe call per segment, restricted to that segment's time
        # window through clip_timestamps=[start, end].
        pieces, _ = model.transcribe(str(audio), language="en",
                                     clip_timestamps=[seg.start, seg.end])
        texts.append(" ".join(p.text.strip() for p in pieces).strip())
    return texts


# ---------------------------------------------------------------------------
# 音素映射表（phoneme.json: IPA -> NN.mp3）
# ---------------------------------------------------------------------------

def load_phoneme_map() -> dict[str, str]:
    """Load the IPA -> file-name mapping from ``config.PHONEME_MAP_FILE``.

    Returns:
        The parsed JSON content, or an empty dict when the file does not exist.
    """
    map_path = config.PHONEME_MAP_FILE
    if not map_path.exists():
        return {}
    with open(map_path, "r", encoding="utf-8") as handle:
        return json.load(handle)


def next_index(phoneme_map: dict[str, str]) -> int:
    """Return the next free numeric file index for *phoneme_map*.

    Only values of the form ``<digits>.mp3`` are considered. The result is one
    more than the largest such number, or 1 when no value matches.
    """
    numbered = re.compile(r"(\d+)\.mp3$")
    hits = (numbered.match(name) for name in phoneme_map.values())
    taken = [int(hit.group(1)) for hit in hits if hit]
    if not taken:
        return 1
    return max(taken) + 1


def resolve_filename(ipa: str, phoneme_map: dict[str, str]) -> str:
    """Return the output file name for *ipa*, registering a new one if needed.

    An IPA symbol that is not yet in *phoneme_map* is assigned the next free
    index (zero-padded to at least two digits, ``.mp3`` suffix), and the
    mapping is updated in place.
    """
    if ipa not in phoneme_map:
        phoneme_map[ipa] = f"{next_index(phoneme_map):02d}.mp3"
    return phoneme_map[ipa]


# ---------------------------------------------------------------------------
# 主流程
# ---------------------------------------------------------------------------

def load_order(order_file: Optional[str]) -> list[str]:
    """Return the phoneme order used to map segments.

    Args:
        order_file: Path to a UTF-8 text file with one IPA symbol per line;
            surrounding whitespace is stripped and blank lines are dropped.
            When falsy, a copy of ``DEFAULT_PHONEME_ORDER`` is returned.
    """
    if not order_file:
        return list(DEFAULT_PHONEME_ORDER)
    content = Path(order_file).read_text(encoding="utf-8")
    trimmed = (line.strip() for line in content.splitlines())
    return [symbol for symbol in trimmed if symbol]


def build_units(segs: list[Segment], order: list[str], phoneme_map: dict[str, str],
                unit_utterances: int, clip_mode: str, pad_ms: int,
                duration: float) -> list[Unit]:
    """Map segments onto phonemes positionally and build the clip units.

    Args:
        segs: Detected segments, in playback order.
        order: IPA symbols in the order they are expected to be spoken.
        phoneme_map: IPA -> file-name mapping; new symbols are added in place
            through ``resolve_filename``.
        unit_utterances: Number of consecutive segments per phoneme (min. 1).
        clip_mode: ``"ipa"`` clips only the first segment of a unit; any other
            value clips from the first to the last segment of the unit.
        pad_ms: Padding added before and after each clip, in milliseconds.
        duration: Total audio length in seconds; clip ends are capped to it.

    Returns:
        One ``Unit`` per phoneme for which a first segment exists; mapping
        stops as soon as the segments run out.
    """
    margin = pad_ms / 1000.0
    per_unit = max(1, unit_utterances)
    final_idx = len(segs) - 1
    result: list[Unit] = []
    for number, ipa in enumerate(order, start=1):
        head_idx = (number - 1) * per_unit
        if head_idx > final_idx:
            break
        head = segs[head_idx]
        if clip_mode == "ipa":
            # Only the IPA utterance, i.e. the first segment of the unit.
            raw_end = head.end
        else:
            # Whole unit; the tail index is clamped when the unit is incomplete.
            raw_end = segs[min(head_idx + per_unit - 1, final_idx)].end
        clip_start = max(0.0, head.start - margin)
        clip_end = min(duration, raw_end + margin)
        result.append(Unit(
            index=number,
            ipa=ipa,
            file=resolve_filename(ipa, phoneme_map),
            start=round(clip_start, 3),
            end=round(clip_end, 3),
        ))
    return result


def extract_clip(ffmpeg: str, audio: Path, out: Path, start: float, end: float) -> bool:
    """Cut ``[start, end]`` seconds out of *audio* and encode it to MP3 at *out*.

    The parent directory of *out* is created if missing, and an existing
    output file is overwritten (``-y``).

    Returns:
        ``True`` when ffmpeg exits with status 0, ``False`` otherwise.
    """
    out.parent.mkdir(parents=True, exist_ok=True)
    seek_from = f"{start:.3f}"
    seek_to = f"{end:.3f}"
    command = [ffmpeg, "-y", "-ss", seek_from, "-to", seek_to, "-i", str(audio)]
    command += ["-vn", "-acodec", "libmp3lame", "-b:a", "192k", str(out)]
    return run(command).returncode == 0


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the extraction tool."""
    ap = argparse.ArgumentParser(description="音素发音自动提取（静音切分 + 可选ASR + 自动剪辑）")
    ap.add_argument("--input", required=True, help="输入音频/视频文件")
    ap.add_argument("--out-dir", default=str(config.PHONEMES_AUDIO_DIR),
                    help="输出目录（默认 config.PHONEMES_AUDIO_DIR）")
    ap.add_argument("--noise", default="-30dB", help="静音判定阈值，默认 -30dB（环境吵可调到 -35dB）")
    ap.add_argument("--min-silence", type=float, default=0.4, help="最短静音时长(秒)，默认0.4")
    ap.add_argument("--min-seg", type=float, default=0.2, help="最短有效片段(秒)，过滤杂音，默认0.2")
    ap.add_argument("--skip-intro-sec", type=float, default=0.0, help="跳过开头测试朗读的秒数")
    ap.add_argument("--skip-segments", type=int, default=0, help="跳过开头的片段数（与--skip-intro-sec二选一）")
    ap.add_argument("--unit-utterances", type=int, default=1, choices=[1, 2],
                    help="每个音素单元包含几段：1=音标(可能含词)；2=音标段+样例词段")
    ap.add_argument("--clip", dest="clip_mode", default="unit", choices=["ipa", "unit"],
                    help="剪辑范围：ipa=仅音标段；unit=音标到样例词整体（默认）")
    ap.add_argument("--pad-ms", type=int, default=80, help="片段前后留白(毫秒)，默认80")
    ap.add_argument("--order-file", default=None, help="音素顺序文件(每行一个IPA)，默认内置教学顺序")
    ap.add_argument("--asr", action="store_true", help="启用 faster-whisper 转写(仅供review核对)")
    ap.add_argument("--asr-model", default="small", help="whisper 模型大小，默认 small")
    ap.add_argument("--ffmpeg", default="ffmpeg", help="ffmpeg 可执行路径")
    ap.add_argument("--ffprobe", default="ffprobe", help="ffprobe 可执行路径")
    ap.add_argument("--review", default=None, help="review.json 输出路径，默认 <out-dir>/review.json")
    ap.add_argument("--apply", action="store_true", help="真正剪辑落盘（默认仅 dry-run 生成 review）")
    return ap


def main():
    """Command-line entry point.

    Steps: probe the duration, split on silence, drop the intro, optionally
    transcribe, map segments to phonemes, and write ``review.json``. Only with
    ``--apply`` are the clips cut and ``phoneme.json`` / ``metadata.json``
    written.

    Raises:
        SystemExit: If the input file is missing, or (exit status 1) if any
            clip failed to extract in ``--apply`` mode.
    """
    args = _build_parser().parse_args()

    audio = Path(args.input)
    if not audio.exists():
        raise SystemExit(f"输入文件不存在：{audio}")
    out_dir = Path(args.out_dir)
    review_path = Path(args.review) if args.review else out_dir / "review.json"

    print("[1/5] 探测时长 ...")
    duration = probe_duration(args.ffprobe, audio)
    print(f"      时长 {duration:.2f}s")

    print(f"[2/5] 静音切分 (noise={args.noise}, d={args.min_silence}) ...")
    silences = detect_silence(args.ffmpeg, audio, args.noise, args.min_silence)
    segs = silences_to_segments(silences, duration, args.min_seg)
    print(f"      检出 {len(segs)} 个说话片段")

    # Drop the intro; --skip-segments takes precedence over --skip-intro-sec.
    if args.skip_segments > 0:
        segs = segs[args.skip_segments:]
        print(f"      跳过开头 {args.skip_segments} 段，剩余 {len(segs)}")
    elif args.skip_intro_sec > 0:
        before = len(segs)
        segs = [s for s in segs if s.start >= args.skip_intro_sec]
        print(f"      跳过 < {args.skip_intro_sec}s 的开头，{before} -> {len(segs)} 段")

    # Optional ASR; one transcript per remaining segment.
    transcripts = transcribe(audio, segs, args.asr_model) if args.asr else ["" for _ in segs]

    print(f"[3/5] 顺序映射音素 (unit_utterances={args.unit_utterances}, clip={args.clip_mode}) ...")
    order = load_order(args.order_file)
    phoneme_map = load_phoneme_map()
    units = build_units(segs, order, phoneme_map, args.unit_utterances,
                        args.clip_mode, args.pad_ms, duration)
    step = max(1, args.unit_utterances)
    # Attach the transcript of each unit's first segment.
    if args.asr:
        for u in units:
            si = (u.index - 1) * step
            if si < len(transcripts):
                u.transcript = transcripts[si]

    expected = len(order)
    got = len(units)
    needed = expected * step
    # Warn on surplus segments too: an extra (e.g. noise) segment shifts every
    # later mapping even though the number of units still matches.
    if got != expected or len(segs) != needed:
        print(f"      [注意] 映射到 {got} 个音素，期望 {expected} 个；"
              f"检出 {len(segs)} 段，期望 {needed} 段。"
              f"片段数与音素数不匹配，建议调整 --noise/--min-silence/--skip-* 后重试。")

    print(f"[4/5] 写出 review：{review_path}")
    review = {
        "input": str(audio),
        "duration": round(duration, 3),
        "out_dir": str(out_dir),
        "params": {
            "noise": args.noise, "min_silence": args.min_silence,
            "min_seg": args.min_seg, "skip_intro_sec": args.skip_intro_sec,
            "skip_segments": args.skip_segments, "unit_utterances": args.unit_utterances,
            "clip": args.clip_mode, "pad_ms": args.pad_ms,
        },
        "segment_count": len(segs),
        "units": [asdict(u) for u in units],
    }
    out_dir.mkdir(parents=True, exist_ok=True)
    # --review may point outside out_dir; make sure its directory exists too.
    review_path.parent.mkdir(parents=True, exist_ok=True)
    with open(review_path, "w", encoding="utf-8") as fh:
        json.dump(review, fh, ensure_ascii=False, indent=2)

    if not args.apply:
        print("\n[dry-run] 未落盘。请检查 review.json 的映射与时间，确认后加 --apply 执行剪辑。")
        return

    print(f"[5/5] 剪辑落盘到 {out_dir} ...")
    ok, fail = 0, 0
    for u in units:
        if extract_clip(args.ffmpeg, audio, out_dir / u.file, u.start, u.end):
            ok += 1
        else:
            fail += 1
            print(f"      [失败] {u.ipa} -> {u.file}")
    # Persist phoneme.json, which may have gained newly numbered phonemes.
    with open(config.PHONEME_MAP_FILE, "w", encoding="utf-8") as fh:
        json.dump(phoneme_map, fh, ensure_ascii=False, indent=2)
    # Persist the per-unit metadata of this run.
    with open(out_dir / "metadata.json", "w", encoding="utf-8") as fh:
        json.dump([asdict(u) for u in units], fh, ensure_ascii=False, indent=2)
    print(f"      完成：成功 {ok}，失败 {fail}。已更新 phoneme.json 与 metadata.json")
    if fail:
        # Let shells and scripts notice that some clips were not produced.
        raise SystemExit(1)


if __name__ == "__main__":
    main()
